@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _

import java.lang.InterruptedException
import groovy.transform.Field
import groovy.json.JsonSlurper
import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import com.nvidia.bloom.CloudManager
import com.nvidia.bloom.SlurmConfig
import com.nvidia.bloom.SlurmCluster
import com.nvidia.bloom.SlurmPartition
import com.nvidia.bloom.Utils
import com.nvidia.bloom.ContainerRuntime
import org.jenkinsci.plugins.workflow.cps.CpsThread
import org.jsoup.Jsoup
import org.jenkinsci.plugins.pipeline.modeldefinition.Utils as jUtils

// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"

ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"

X86_64_TRIPLE = "x86_64-linux-gnu"
AARCH64_TRIPLE = "aarch64-linux-gnu"

// Default package name
linuxPkgName = (
    env.targetArch == AARCH64_TRIPLE ? "tensorrt-llm-sbsa-release-src-" : "tensorrt-llm-release-src-"
) + (env.artifactCommit ? env.artifactCommit : env.gitlabCommit) + ".tar.gz"

// Container configuration
// Available tags can be found at: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
// Tag format: [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
LLM_DOCKER_IMAGE = env.dockerImage
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312

// DLFW torch image
DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"

// Ubuntu base images
UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
UBUNTU_24_04_IMAGE = "urm.nvidia.com/docker/ubuntu:24.04"

POD_TIMEOUT_SECONDS_TEST = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
POD_TIMEOUT_SECONDS_SLURM = env.podTimeoutSeconds ? env.podTimeoutSeconds : "79200" // Use 22 hours to allow for 2 hours of buffer.

// Literals for easier access.
@Field def TARNAME = "tarName" @Field def VANILLA_CONFIG = "Vanilla" @Field def SINGLE_DEVICE_CONFIG = "SingleDevice" @Field def LLVM_CONFIG = "LLVM" @Field def LINUX_AARCH64_CONFIG = "linux_aarch64" @Field def PYBIND_CONFIG = "Pybind" @Field def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage (VANILLA_CONFIG) : [(TARNAME) : "TensorRT-LLM.tar.gz"], (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"], (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"], (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"], (PYBIND_CONFIG) : [(TARNAME) : "pybind-TensorRT-LLM.tar.gz"], ] // TODO: Move common variables to an unified location BUILD_CORES_REQUEST = "8" BUILD_CORES_LIMIT = "8" BUILD_MEMORY_REQUEST = "48Gi" BUILD_MEMORY_LIMIT = "96Gi" BUILD_JOBS = "4" SLURM_CORES_REQUEST = "1" SLURM_CORES_LIMIT = "1" SLURM_MEMORY_REQUEST = "8Gi" SLURM_MEMORY_LIMIT = "12Gi" TESTER_CORES = "12" TESTER_MEMORY = "96Gi" CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache" MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models" // GPU types that require open driver REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"] // GPU types that don't support dynamic driver flashing REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"] // ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20" def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){ withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ ip : randomLoginNode, host : randomLoginNode, port : cluster.sshPort?:22, user : "${pipeline.USERNAME}", passwd : "${pipeline.PASSWORD}", allowAnyHosts: true, ] Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") def hasTimeoutTest = false def downloadResultSucceed = false pipeline.stage('Submit Test Result') { sh "mkdir -p ${stageName}" // Download timeout test results def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt" def downloadTimeoutTestSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${timeoutTestFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0 if (downloadTimeoutTestSucceed) { sh "ls ${stageName}" def timeoutTestXml = generateTimeoutTestResultXml(stageName, "unfinished_test.txt") if (timeoutTestXml != null) { sh """ cat > ${stageName}/results-timeout.xml << 'EOF_TIMEOUT_XML' ${timeoutTestXml} EOF_TIMEOUT_XML """ hasTimeoutTest = true } } // Download normal test results def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml" downloadResultSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0 echo "hasTimeoutTest: ${hasTimeoutTest}, downloadResultSucceed: 
${downloadResultSucceed}" if (hasTimeoutTest || downloadResultSucceed) { sh "ls ${stageName}" echo "Upload test results." sh "tar -czvf results-${stageName}.tar.gz ${stageName}/" ensureStageResultNotUploaded(stageName) trtllm_utils.uploadArtifacts( "results-${stageName}.tar.gz", "${UPLOAD_PATH}/test-results/" ) } else { println("No results xml to submit") } } if (hasTimeoutTest || downloadResultSucceed) { junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml") } } } def runIsolatedTests(preprocessedLists, testCmdLine, llmSrc, stageName) { // Run the isolated tests one by one to avoid any potential conflicts def isolateTestList = preprocessedLists.isolate def isolateTestLines = readFile(file: isolateTestList).readLines() def rerunFailed = false for (int i = 0; i < isolateTestLines.size(); i++) { def isolateTestName = isolateTestLines[i].trim() // Create a temporary file for this single isolated test def singleTestFile = "${isolateTestList}_isolated_${i}.txt" sh "echo '${isolateTestName}' > ${singleTestFile}" sh "cat ${singleTestFile}" def isolateTestCmdLine = testCmdLine.findAll { cmd -> !cmd.contains("--test-list=") && !cmd.contains("--test-prefix=") && !cmd.contains("--csv=") && !cmd.contains("--periodic-junit-xmlpath") } isolateTestCmdLine += ["--test-list=${singleTestFile}"] isolateTestCmdLine += ["--test-prefix=${stageName}"] isolateTestCmdLine += ["--csv=${WORKSPACE}/${stageName}/report_isolated_${i}.csv"] isolateTestCmdLine += ["--periodic-junit-xmlpath ${WORKSPACE}/${stageName}/results_isolated_${i}.xml"] isolateTestCmdLine += ["--cov-append"] // Append coverage data to avoid overwriting previous data try { sh """ cd ${llmSrc}/tests/integration/defs && \ ${isolateTestCmdLine.join(" ")} """ } catch (InterruptedException e) { throw e } catch (Exception e) { def isRerunFailed = rerunFailedTests(stageName, llmSrc, isolateTestCmdLine, "results_isolated_${i}.xml", "isolated_${i}") if (isRerunFailed) { catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { error "Isolated test ${i} (${isolateTestName}) failed after rerun attempt" } // Mark that at least one isolated test failed, but continue processing other tests rerunFailed = true } } finally { // Clean up the temporary test file sh "rm -f ${singleTestFile}" } } // After processing all isolated tests, set stage failure if any test failed if (rerunFailed) { catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { error "One or more isolated tests failed after rerun attempts" } } return rerunFailed // Return the updated value } def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) { // Preprocess testDBList to extract ISOLATION markers echo "Preprocessing testDBList to extract ISOLATION markers..." 
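// Illustrative note (test names assumed): a test-db list entry may carry an ISOLATION marker next to
// other markers, e.g. "examples/test_foo.py::test_bar[config] OTHER_MARKER,ISOLATION".
// The loop below strips the ISOLATION marker (and its neighboring comma) so pytest receives a clean
// test id, and records the test in isolationTestLines so runIsolatedTests() can run it one by one.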
def originalTestLines = readFile(file: testDBList).readLines() def cleanedTestLines = [] def isolationTestLines = [] originalTestLines.each { originalLine -> def trimmedLine = originalLine.trim() if (trimmedLine && trimmedLine.contains('ISOLATION')) { // Remove ISOLATION marker and nearby comma from the line def cleanedLine = trimmedLine // Handle different comma patterns around ISOLATION if (trimmedLine.contains('ISOLATION,')) { // Case: "ISOLATION,OTHER_MARKER" -> remove "ISOLATION," cleanedLine = cleanedLine.replace('ISOLATION,', '').trim() } else if (trimmedLine.contains(',ISOLATION')) { // Case: "OTHER_MARKER,ISOLATION" -> remove ",ISOLATION" cleanedLine = cleanedLine.replace(',ISOLATION', '').trim() } else { // Case: standalone "ISOLATION" -> remove " ISOLATION" cleanedLine = cleanedLine.replace(' ISOLATION', '').trim() } // Add the cleaned line to isolationTestLines if original line had ISOLATION isolationTestLines.add(cleanedLine) cleanedTestLines.add(cleanedLine) } else if (trimmedLine) { // Line doesn't contain ISOLATION, add as-is cleanedTestLines.add(originalLine.trim()) } } // Create cleaned testDBList file (without ISOLATION markers) def cleanedTestDBList = testDBList.replaceAll('\\.txt$', '_cleaned.txt') if (cleanedTestLines.size() > 0) { def cleanedContent = cleanedTestLines.join('\n') sh "echo '${cleanedContent.replace("'", "'\\''")}' > ${cleanedTestDBList}" echo "Created cleaned testDBList: ${cleanedTestDBList} with ${cleanedTestLines.size()} lines (ISOLATION markers removed)" } else { sh "touch ${cleanedTestDBList}" echo "No tests found, created empty cleaned testDBList: ${cleanedTestDBList}" } sh "cat ${cleanedTestDBList}" echo "Original testDBList contains ${isolationTestLines.size()} tests that had ISOLATION markers" def shardTestList = [] if (perfMode) { // In perfMode, skip pytest collection as it may cause errors with automatically generated testcases // Instead, use all tests from the original testDBList echo "Performance mode enabled - skipping pytest collection, using all tests from testDBList" } else { def testListCmd = [ "LLM_ROOT=${llmSrc}", "LLM_BACKEND_ROOT=${llmSrc}/triton_backend", "pytest", "--collect-only", "--splitting-algorithm least_duration", "--test-list=${cleanedTestDBList}", "--quiet", "--splits ${splits}", "--group ${splitId}" ] try { // First execute the pytest command and check if it succeeds def pytestOutput = sh( script: "cd ${llmSrc}/tests/integration/defs && ${testListCmd.join(' ')}", returnStdout: true ).trim() // Debug: Show the raw pytest output echo "<<>>" echo "${pytestOutput}" echo "<<>>" // Filter the output to get only test lines with '::' that occur after "Running X items in this shard" def lines = pytestOutput.split('\n') def foundRunningLine = false def lineIndex = 0 shardTestList = lines.findAll { line -> lineIndex++ if (line.matches(/.*Running \d+ items in this shard.*/) || line.matches(/.*\[pytest-split\] Running group.*/)) { foundRunningLine = true return false // Don't include the "Running" line itself } def hasDoubleColon = line.contains('::') def shouldInclude = foundRunningLine && hasDoubleColon return shouldInclude } echo "Filtering complete. shardTestList size: ${shardTestList.size()}" } catch (Exception e) { echo "Error: Failed to execute pytest command for test collection: ${e.getMessage()}" error "Test collection failed for shard ${splitId}/${splits}. Cannot proceed without valid test list." 
} } if (shardTestList || perfMode) { // Split the shard test list into regular and isolate tests def shardRegularTests = [] def shardIsolateTests = [] if (perfMode) { // In perfMode, put all tests in regular and skip isolation echo "Performance mode enabled - all tests will run as regular tests (no isolation)" shardRegularTests = cleanedTestLines.findAll { it.trim() } } else { // Process each test from shardTestList shardTestList.each { test -> def trimmedTest = test.trim() if (trimmedTest) { // Process test_unittests.py::test_unittests_v2[xxxx] pattern if (trimmedTest.startsWith('test_unittests.py::test_unittests_v2[') && trimmedTest.endsWith(']')) { // Extract content between [ and ] def startIndex = trimmedTest.indexOf('[') + 1 def endIndex = trimmedTest.lastIndexOf(']') trimmedTest = trimmedTest.substring(startIndex, endIndex) } // Check if this test is in the isolation list def isolationTestLine = isolationTestLines.find { it.contains(trimmedTest) } if (isolationTestLine) { // This test needs isolation shardIsolateTests.add(isolationTestLine) } else { // This test is a regular test - find the actual line from cleanedTestLines def cleanedTestLine = cleanedTestLines.find { it.contains(trimmedTest) } shardRegularTests.add(cleanedTestLine) } } } } // Define file paths for regular and isolate tests def regularTestList = testDBList.replaceAll('\\.txt$', '_regular.txt') def isolateTestList = testDBList.replaceAll('\\.txt$', '_isolate.txt') // Create shard-specific test files if (shardRegularTests.size() > 0) { def shardRegularContent = shardRegularTests.join('\n') sh "echo '${shardRegularContent.replace("'", "'\\''")}' > ${regularTestList}" echo "Created ${regularTestList} with ${shardRegularTests.size()} regular tests for this shard" } else { sh "touch ${regularTestList}" echo "No regular tests in this shard, created empty file: ${regularTestList}" } sh "cat ${regularTestList}" if (shardIsolateTests.size() > 0) { def shardIsolateContent = shardIsolateTests.join('\n') sh "echo '${shardIsolateContent.replace("'", "'\\''")}' > ${isolateTestList}" echo "Created ${isolateTestList} with ${shardIsolateTests.size()} isolate tests for this shard" } else { sh "touch ${isolateTestList}" echo "No isolate tests in this shard, created empty file: ${isolateTestList}" } sh "cat ${isolateTestList}" // Return preprocessed lists object for compatibility return [ regular: regularTestList, isolate: isolateTestList, regularCount: shardRegularTests.size(), isolateCount: shardIsolateTests.size() ] } else { echo "No tests found in current shard or failed to list tests" // Create empty files and preprocessed lists object def regularTestList = testDBList.replaceAll('\\.txt$', '_regular.txt') def isolateTestList = testDBList.replaceAll('\\.txt$', '_isolate.txt') sh "touch ${regularTestList}" sh "touch ${isolateTestList}" return [ regular: regularTestList, isolate: isolateTestList, regularCount: 0, isolateCount: 0 ] } } def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){ withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ ip : randomLoginNode, host : randomLoginNode, port : cluster.sshPort, user : "${pipeline.USERNAME}", passwd : "${pipeline.PASSWORD}", allowAnyHosts: true, ] def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/${jobUID}" Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") 
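// The clean-up below reads the Slurm job id that slurm_run.sh recorded in ${jobWorkspace}/slurm_job_id.txt,
// cancels the job and dumps sacct/scontrol diagnostics, then removes the per-job enroot .sqsh image
// and the job workspace on the login node.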
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30") def slurmJobID = Utils.exec( pipeline, // Try to grab the job id from ${jobWorkspace}/slurm_job_id.txt. // The slurm_run.sh will add the slurm job id in that file. script: Utils.sshUserCmd( remote, "\"cat ${jobWorkspace}/slurm_job_id.txt || true\"" ), returnStdout: true ).trim() if (!slurmJobID || !slurmJobID.isNumber()) { echo "Slurm job may not submit successfully. No job ID found." } else { Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}") Utils.exec( pipeline, script: Utils.sshUserCmd( remote, "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\"" ) ) } Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30") def cleanupCommands = [ "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true", "rm -rf ${jobWorkspace} || true", ].join(" ; ") Utils.exec( pipeline, script: Utils.sshUserCmd( remote, "\"${cleanupCommands}\"" ) ) Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up") } } // Methods to run Slurm job with Jenkins Agent def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) { withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ ip : randomLoginNode, host : randomLoginNode, port : cluster.sshPort, user : "${pipeline.USERNAME}", passwd : "${pipeline.PASSWORD}", allowAnyHosts: true, ] Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30") CloudManager.destroyNode(nodeName) Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30") Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}") Utils.exec( pipeline, script: Utils.sshUserCmd( remote, "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\"" ) ) Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30") def entrypoint = SlurmConfig.containerRuntimeToEntrypoint[cluster.containerRuntime] def cleanupCommands = [ "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true", "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true", ].join(" ; ") Utils.exec( pipeline, script: Utils.sshUserCmd( remote, "\"${cleanupCommands}\"" ) ) Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up") } } def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, skipInstallWheel=false, cpver="cp312") { SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName] def entrypoint = SlurmConfig.containerRuntimeToEntrypoint[cluster.containerRuntime] // Create a unique suffix for the node name and workspace String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() 
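// The suffix keeps node names unique across concurrent builds; with assumed values it yields
// something like "jenkins-llm-l0-mr-1234-a1b2c3", giving a node name such as
// "<cluster-host>-test-jenkins-llm-l0-mr-1234-a1b2c3" and the workspace "/tmp/<node name>" below.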
def nodeName = "${cluster.host}-test-${customSuffix}" def customWorkspace = "/tmp/${nodeName}" def nodeSecret = CloudManager.createNode(nodeName, customWorkspace) def slurmJobID = null def dockerArgs = null try { // Run ssh command to start node in desired cluster via SLURM withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ ip : randomLoginNode, host : randomLoginNode, port : cluster.sshPort, user : "${pipeline.USERNAME}", passwd : "${pipeline.PASSWORD}", allowAnyHosts: true, ] Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") stage('Request Node Via Slurm') { println("Selected Cluster: ${cluster.name}") def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint) Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}") Utils.copyFileToRemoteHost(pipeline, remote, jenkinsSetupPath, "/home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint}", true) Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))") def mounts = getMountListForSlurmTest(cluster, false).join(",") def slurmSubmitOutput = Utils.exec( pipeline, timeout: false, script: Utils.sshUserCmd( remote, "\"${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl, LLM_DOCKER_IMAGE, mounts)}\"" ), returnStdout: true, numRetries: 3 ) def jobIDs = slurmSubmitOutput .readLines() .collect { it.trim() } .collectMany { line -> def ids = [] def m1 = (line =~ /Submitted batch job (\d+)/) if (m1) ids << m1[0][1] // Extract the first captured group def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/) if (m2) ids << m2[0][1] // Extract the first captured group def m3 = (line =~ /SLURM_JOB_ID=(\d+)/) if (m3) ids << m3[0][1] // Extract the first captured group def m4 = (line =~ /SLURM_JOBID=(\d+)/) if (m4) ids << m4[0][1] // Extract the first captured group return ids } slurmJobID = jobIDs ? jobIDs[-1] : null if (!slurmJobID || !slurmJobID.isNumber()) { echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}" } Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}") Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30") } } stage('Check If Node Is Online') { withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ ip : randomLoginNode, host : randomLoginNode, port : cluster.sshPort, user : "${pipeline.USERNAME}", passwd : "${pipeline.PASSWORD}", allowAnyHosts: true, ] def counter = 0 // We submit the Slurm job with 5 hours timeout, and the K8S pod will be evicted after 22 hours. // Let's use 15 hours to check if the node is online, and with 2 hours buffer. while (!CloudManager.isNodeOnline(nodeName) && counter < 90) { // Wait 10 minutes to check status of the node again sleep(time: 10, unit: 'MINUTES') // Avoid the node being stuck in the held state. 
if (counter % 3 == 0) { Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3) } counter++ // If entrypoint script fails to start, do not poll for agent connection try { SlurmConfig.checkJobStatus(pipeline, cluster, slurmJobID, remote) } catch (InterruptedException e) { throw e } catch (Exception e) { // If the exception is about job being inactive, enrich it with log path if (e.message.contains("is no longer active")) { throw new Exception("${e.message}. Check SLURM logs at /home/svc_tensorrt/slurm-logs/slurm-${slurmJobID}-${nodeName}.out on ${cluster.host}") } // Otherwise, log the error but continue (SSH might be temporarily unavailable) pipeline.echo("Warning: Could not check SLURM job status: ${e.message}") } } } if (CloudManager.isNodeOnline(nodeName)) { node(nodeName) { sh """ env | sort pwd && ls -alh ls -alh ${env.WORKSPACE} ls -alh ${env.WORKSPACE_TMP} """ sh "nproc && free -g && hostname" echoNodeAndGpuInfo(pipeline, stageName) sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m" // Use single quotes to avoid Jenkins variable expansion sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"' sh 'echo "NV_GPU: $NV_GPU"' // Dynamically set GPU arguments based on environment variables // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html // It's intentional to check NV_GPU first. dockerArgs = sh(script: """ if [ -n "\$NV_GPU" ]; then echo "--gpus '\\"device=\$NV_GPU\\"'" elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'" else echo "--gpus ${gpuCount}" fi """, returnStdout: true).trim() if (cluster.host.contains("dlcluster")) { dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim() if (fileExists('/dev/gdrdrv')) { dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv" } } } dockerArgs = "${dockerArgs} " + "--cap-add=SYS_ADMIN " + "--ipc=host " + "--entrypoint=\"\" " + "--security-opt seccomp=unconfined " + "-u root:root " + "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " + "-v /tmp/ccache:${CCACHE_DIR}:rw " + "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " + "--cap-add=SYSLOG" echo "Final dockerArgs: ${dockerArgs}" } else { error "The Slurm node does not come online in the waiting period. Terminating the job." } } slurmRunner = null echo "${stageName} Slurm partition timeout: ${partition.time}" def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT if (cluster.containerRuntime.toString() == "DOCKER") { slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true) } else if (cluster.containerRuntime.toString() == "ENROOT") { slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout) } else { throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}") } executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner) } finally { stage("Clean Up Slurm Resource") { // Workaround to handle the interruption during clean up SLURM resources retry(3) { try { cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID) } catch (Exception e) { error "Error during clean up SLURM resources: ${e.getMessage()} and retrying." 
} } } } } def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner) { runner { // TODO: refactor the finallyRunner to reuse within slurm or nonslurm job. cacheErrorAndUploadResult(stageName, { runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver) }, { // If the execution test list is null, remove the test result xml sh """ ls -all ${stageName}/ if ! grep -q ' 1) { args += ["--segment=${nodeCount}"] } return args } def getPytestBaseCommandLine( String llmSrc, String stageName, String waivesFilePath, Boolean perfMode, String outputPath, String trtllmWheelPath, String coverageConfigFile, String pytestUtil = "", List extraArgs = [], int containerPortStart = 0, int containerPortNum = 0 ) { def extraInternalEnv = "" def pytestTestTimeout = "3600" // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines. extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\"" // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}" // Enable NCCL debug information for multi-GPU tests extraInternalEnv += " NCCL_DEBUG=INFO" // Container port allocation environment variables for avoiding port conflicts def portEnvVars = "" if (containerPortStart > 0 && containerPortNum > 0) { portEnvVars = "CONTAINER_PORT_START=${containerPortStart} CONTAINER_PORT_NUM=${containerPortNum}" } def testCmdLine = [ "LLM_ROOT=${llmSrc}", "LLM_BACKEND_ROOT=${llmSrc}/triton_backend", "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}", "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}", "COLUMNS=300", extraInternalEnv, portEnvVars, pytestUtil, "pytest", "-vv", testFilter[(DETAILED_LOG)] ? 
"-s" : "", "--timeout-method=thread", "--apply-test-list-correction", "--timeout=${pytestTestTimeout}", "--rootdir ${llmSrc}/tests/integration/defs", "--test-prefix=${stageName}", "--waives-file=${waivesFilePath}", "--output-dir=${outputPath}/", "--csv=${outputPath}/report.csv", "-o junit_logging=out-err", "--cov=${llmSrc}/examples/", "--cov=${llmSrc}/tensorrt_llm/", "--cov=${trtllmWheelPath}/tensorrt_llm/", "--cov-report=", "--cov-config=${coverageConfigFile}", "--periodic-junit", "--periodic-junit-xmlpath ${outputPath}/results.xml", "--periodic-batch-size=1", "--periodic-save-unfinished-test", ] if (perfMode) { testCmdLine += [ "--perf", "--perf-log-formats csv", "--perf-log-formats yaml", "--enable-gpu-clock-lock" ] } if (stageName.contains("-Ray-")) { testCmdLine += ["--run-ray"] } if (extraArgs) { testCmdLine += extraArgs } return testCmdLine as String[] } def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false) { def mounts = [] // mounts for SLURM job submission and logs if (useSbatch) { mounts += [ "/home/svc_tensorrt/bloom/scripts", ] } else { mounts += [ "/home/svc_tensorrt/bloom/scripts", "/home/svc_tensorrt/slurm-logs", ] } // data/cache mounts if (cluster.containerRuntime.toString() == "DOCKER") { mounts += [ "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro", ] } else if (cluster.containerRuntime.toString() == "ENROOT") { if (!cluster.scratchPath) { throw new Exception("Scratch path is not set for cluster: ${cluster.name}") } mounts += [ "${cluster.scratchPath}:/scratch.trt_llm_data:ro", "/home/svc_tensorrt/.cache:/root/.cache", ] } else { throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}") } return mounts } def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312") { SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName] // Create a unique suffix for the job name String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" def disaggMode = stageName.contains("PerfSanity-Disagg") Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") try { // Run ssh command to start node in desired cluster via SLURM withCredentials([ usernamePassword( credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD' ) ]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ ip : randomLoginNode, host : randomLoginNode, port : cluster.sshPort, user : "${pipeline.USERNAME}", passwd : "${pipeline.PASSWORD}", allowAnyHosts: true, ] Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") def tarName = BUILD_CONFIGS[config][TARNAME] def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}" def llmPath = sh (script: "realpath .", returnStdout: true).trim() def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/${jobUID}" def resourcePathNode = "/tmp" def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src" def llmSrcLocal = "${llmPath}/TensorRT-LLM/src" def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh" def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh" def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh" 
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh" def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh" def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh" def testListPathNode = "${jobWorkspace}/${testList}.txt" def waivesListPathNode = "${jobWorkspace}/waives.txt" def slurmJobLogPath = "${jobWorkspace}/job-output.log" def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh") def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh" def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh") def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh" def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh") def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh" def coverageConfigFile = "${jobWorkspace}/.coveragerc" stage("Initialize Test") { println("Selected Cluster: ${cluster.name}") // Create Job Workspace folder in Frontend Node Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3) // Download and Unzip Tar File trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}") sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}" Utils.exec(pipeline, script: "echo \"Script for Slurm srun job to submit: \" && cat ${scriptRunLocalPath}") Utils.copyFileToRemoteHost( pipeline, remote, scriptRunLocalPath, scriptRunPathNode, true ) Utils.exec(pipeline, script: "echo \"Script to install TensorRT LLM dependencies: \" && cat ${scriptInstallLocalPath}") Utils.copyFileToRemoteHost( pipeline, remote, scriptInstallLocalPath, scriptInstallPathNode, true ) Utils.exec(pipeline, script: "echo \"Script for Bash utilities: \" && cat ${scriptBashUtilsLocalPath}") Utils.copyFileToRemoteHost( pipeline, remote, scriptBashUtilsLocalPath, scriptBashUtilsPathNode, true ) // Generate Test List and Upload to Frontend Node def makoArgs = getMakoArgsFromStageName(stageName, true) // TODO: currently the options will only be processed if the first // line is "Mako options:", maybe we can make it more generic, which // if the line cannot be split by "=", just ignore that line. 
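// Illustrative example (option names assumed): makoArgs such as ["stage=post_merge", "backend=pytorch", "auto_trigger=none"]
// are prefixed with a "Mako options:" header line and converted by transformMakoArgsToJson() into JSON like
// {"stage": "post_merge", "backend": "pytorch", "auto_trigger": null}; "none" maps to null and "true"/"false" map to Booleans.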
def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs) def testListPathLocal = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson) Utils.copyFileToRemoteHost( pipeline, remote, testListPathLocal, testListPathNode ) // Download and Merge waives.txt mergeWaivesTxt(pipeline, llmSrcLocal, stageName) // Add passed test list from previous pipeline run to the waives.txt if (testFilter[(REUSE_TEST)] != false) { reusePassedTestResults(llmSrcLocal, stageName, "${llmSrcLocal}/tests/integration/test_lists/waives.txt") } Utils.copyFileToRemoteHost( pipeline, remote, "${llmSrcLocal}/tests/integration/test_lists/waives.txt", waivesListPathNode ) // generate .coveragerc in workspace and add file path to pytest command sh """ touch ./.coveragerc echo '[run]' > ./.coveragerc echo 'branch = True' >> ./.coveragerc echo 'data_file = ${jobWorkspace}/.coverage.${stageName}' >> ./.coveragerc echo '[paths]' >> ./.coveragerc echo 'source =\n ${llmSrcNode}/tensorrt_llm/\n ---wheel_path---/tensorrt_llm//tensorrt_llm/' >> ./.coveragerc cat ./.coveragerc """ Utils.copyFileToRemoteHost( pipeline, remote, "./.coveragerc", coverageConfigFile ) // Generate Pytest command String pytestUtil = "" if (nodeCount > 1) { pytestUtil = "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch" } def pytestCommand = getPytestBaseCommandLine( llmSrcNode, stageName, waivesListPathNode, perfMode, jobWorkspace, "__PLACEHOLDER_TRTLLM_WHL_PATH__", "$jobWorkspace/.coveragerc", pytestUtil, [ "--test-list=$testListPathNode", "--splitting-algorithm least_duration", "--splits $splits", "--group $splitId" ] ).join(" ") // Generate Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") def mounts = getMountListForSlurmTest(cluster, true).join(",") String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode) if (taskArgs == null) { error "Invalid Slurm test stage name is set" } taskArgs = [ *taskArgs, ] def containerImageArg = container def srunPrologue = "" if (cluster.containerRuntime.toString() == "ENROOT") { def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh" containerImageArg = enrootImagePath srunPrologue = """ export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot' importContainerWithRetries() { local docker_uri=\$1 local output_path=\$2 local max_attempts=\${3:-3} local delay=\${4:-60} local attempt=1 rm -f "\$output_path" until enroot import -o "\$output_path" -- "docker://\$docker_uri" do if ((attempt >= max_attempts)) then echo "enroot import failed after \$max_attempts attempts" return 1 fi echo "enroot import failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..." 
rm -f "\$output_path" sleep \$delay ((attempt++)) done } importContainerWithRetries "$container" "$enrootImagePath" """.replaceAll("(?m)^\\s*", "") } // Define environment variables to export def envVarNames = [ 'OPEN_SEARCH_DB_BASE_URL', 'OPEN_SEARCH_DB_CREDENTIALS_USR', 'OPEN_SEARCH_DB_CREDENTIALS_PSW', 'BUILD_ID', 'BUILD_URL', 'JOB_NAME', 'globalVars', 'gitlabCommit' ] def envVarsToExport = [:] envVarNames.each { varName -> envVarsToExport[varName] = env."${varName}" } srunArgs = [ "--container-name=multi_node_test-\${SLURM_JOB_ID}", "--container-image=$containerImageArg", "--container-workdir=$jobWorkspace", "--container-mounts=$mounts", "--container-env=NVIDIA_IMEX_CHANNELS" ] envVarsToExport.each { varName, varValue -> srunArgs.add("--container-env=${varName}") } def exemptionComment = "" if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) { exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'""" } def envExportStatements = envVarsToExport.collect { varName, varValue -> def escapedValue = varValue?.toString() ?: '' escapedValue = escapedValue .replace('\\', '\\\\') // Backslash .replace('"', '\\"') // Double quote .replace('$', '\\$') // Dollar sign (prevent variable expansion) .replace('`', '\\`') // Backtick (prevent command substitution) "export ${varName}=\"${escapedValue}\"" }.join('\n') def scriptLaunchPrefix = """#!/bin/bash #SBATCH ${exemptionComment} #SBATCH --output=${slurmJobLogPath} ${taskArgs.collect { "#SBATCH $it" }.join('\n')} #SBATCH ${partition.additionalArgs} ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"} ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""} # SBATCH directives must appear before any executable commands. 
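# The exports below hand the job workspace, the TensorRT-LLM source tarball URL, and the exact
# pytest command line to the srun'd slurm_run.sh / slurm_install.sh helpers inside the container.
# NVIDIA_VISIBLE_DEVICES defaults to every GPU index reported by nvidia-smi on the allocated node.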
set -xEeuo pipefail trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST" export jobWorkspace=$jobWorkspace export tarName=$tarName export llmTarfile=$llmTarfile export llmSrcNode=$llmSrcNode export stageName=$stageName export perfMode=$perfMode export resourcePathNode=$resourcePathNode export pytestCommand="$pytestCommand" export coverageConfigFile="$coverageConfigFile" export NVIDIA_IMEX_CHANNELS=\${NVIDIA_IMEX_CHANNELS:-0} export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=csv,noheader)-1)))} ${envExportStatements} echo "Env NVIDIA_IMEX_CHANNELS: \$NVIDIA_IMEX_CHANNELS" echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES" ${srunPrologue} """.replaceAll("(?m)^\\s*", "") if (disaggMode) { if(nodeCount > 1) { srunArgs.add("--mpi=pmix") } def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh") def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt") def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh" def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py" pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix) pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" ")) // Output is the corresponding scriptLaunchPathLocal script under the disaggMode sh """ python3 ${scriptSubmitLocalPath} \\ --run-ci \\ --llm-src ${llmSrcLocal} \\ --test-list ${testListPathLocal} \\ --draft-launch-sh ${scriptLaunchDraftPathLocal} \\ --launch-sh ${scriptLaunchPathLocal} \\ --run-sh ${scriptRunPathNode} \\ --install-sh ${scriptInstallPathNode} \\ --script-prefix ${scriptLaunchPrefixPathLocal} \\ --srun-args ${scriptLaunchSrunArgsPathLocal} """ } else { if(nodeCount > 1) { srunArgs.add("--mpi=pmi2") } def scriptContent = """ ${scriptLaunchPrefix} srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode} """.replaceAll("(?m)^\\s*", "") pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent) } Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job to submit: \" && cat ${scriptLaunchPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, scriptLaunchPathLocal, scriptLaunchPathNode, true ) def filesToKeepWhenRetry = [ scriptRunPathNode, scriptInstallPathNode, scriptBashUtilsPathNode, scriptLaunchPathNode, scriptSubmitPathNode, scriptTrackPathNode, testListPathNode, waivesListPathNode, coverageConfigFile ] def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! 
-name \"\$(basename \"${it}\")\"" }.join("") def scriptSubmit = """#!/bin/bash set -xEeuo pipefail trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR # Clean up previous job intermediate files so that retry can work if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt") echo "Found previous Slurm job ID: \${previous_job_id}" scancel "\${previous_job_id}" || true # Wait for 120 seconds to ensure the previous job is canceled sleep 120 fi # Clean up workspace: remove all files/dirs not in the keep list find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} + touch ${slurmJobLogPath} jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}') if [ -z "\$jobId" ]; then echo "Error: Slurm job submission failed, no job ID returned." exit 1 fi echo "Submitted Slurm job \$jobId" # Save Slurm job ID for later steps to retrieve echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt" """.replaceAll("(?m)^\\s*", "").trim() pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit) Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, scriptSubmitPathLocal, scriptSubmitPathNode, true ) } stage("[${stageName}] Run Pytest") { // Submit the Slurm job Utils.exec( pipeline, timeout: false, script: Utils.sshUserCmd( remote, scriptSubmitPathNode ), numRetries: 3 ) def slurmJobId = Utils.exec( pipeline, script: Utils.sshUserCmd( remote, "\"cat ${jobWorkspace}/slurm_job_id.txt\"" ), returnStdout: true, numRetries: 3 ).trim() Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}") def scriptTrack = """#!/bin/bash set -xEeuo pipefail trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR jobId=${slurmJobId} tail -f ${slurmJobLogPath} & tailPid=\$! # Wait until Slurm job is done while true; do # Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0) STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations) if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then echo "Slurm job \$jobId is still running" sleep 300 else echo "Slurm job \$jobId finished with state: \$STATUS" break fi done # Kill tail -f process kill \$tailPid # Wait briefly to ensure accounting is consistent sleep 10 # Get exit code (STATUS is already known from loop break) # Retry for exit code if missing for i in {1..3}; do # Use awk to parse exit code from format like "0:0" EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}') if [ -n "\$EXIT_CODE" ]; then break fi echo "Waiting for sacct exit code to update... attempt \$i" sleep 10 done if [ -z "\$EXIT_CODE" ]; then echo "Error: Failed to get exit code from sacct after retries, defaulting to 1." 
EXIT_CODE=1 fi # We already have valid STATUS from the loop that caused the break if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then echo "Pytest succeed in Slurm job \$jobId" echo "Status: \$STATUS | Exit_code \$EXIT_CODE" exit 0 else echo "Pytest failed in Slurm job \$jobId" echo "Status: \$STATUS | Exit_code \$EXIT_CODE" exit 1 fi """.replaceAll("(?m)^\\s*", "").trim() pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack) Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, scriptTrackPathLocal, scriptTrackPathNode, true ) // Track the Slurm job Utils.exec( pipeline, timeout: false, script: Utils.sshUserCmd( remote, scriptTrackPathNode ), numRetries: 3 ) } echo "Finished test stage execution." } } finally { uploadResults(pipeline, cluster, jobUID, stageName) stage("Clean Up Slurm Resource") { // Workaround to handle the interruption during clean up SLURM resources retry(3) { try { cleanUpSlurmResources(pipeline, cluster, jobUID) } catch (Exception e) { error "Error during clean up SLURM resources: ${e.getMessage()} and retrying." } } } } } def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, runWithSbatch=false, skipInstallWheel=false, cpver="cp312") { echo "Run Slurm job with native sbatch: $runWithSbatch" if (nodeCount > 1 || runWithSbatch) { runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver) } else { runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver) } } def trimForStageList(stageNameList) { if (stageNameList == null) { return null } trimedList = [] stageNameList.each { stageName -> trimedList.add(stageName.trim().replaceAll('\\\\', '')) } return trimedList } // Test filter flags @Field def REUSE_TEST = "reuse_test" @Field def REUSE_STAGE_LIST = "reuse_stage_list" @Field def ENABLE_SKIP_TEST = "skip_test" @Field def TEST_STAGE_LIST = "stage_list" @Field def GPU_TYPE_LIST = "gpu_type" @Field def TEST_BACKEND = "test_backend" @Field def IS_POST_MERGE = "post_merge" @Field def ADD_MULTI_GPU_TEST = "add_multi_gpu_test" @Field def ONLY_MULTI_GPU_TEST = "only_multi_gpu_test" @Field def DISABLE_MULTI_GPU_TEST = "disable_multi_gpu_test" @Field def EXTRA_STAGE_LIST = "extra_stage" @Field def MULTI_GPU_FILE_CHANGED = "multi_gpu_file_changed" @Field def ONLY_ONE_GROUP_CHANGED = "only_one_group_changed" @Field def AUTO_TRIGGER_TAG_LIST = "auto_trigger_tag_list" @Field def DEBUG_MODE = "debug" @Field def DETAILED_LOG = "detailed_log" @Field def testFilter = [ (REUSE_TEST): null, (REUSE_STAGE_LIST): null, (ENABLE_SKIP_TEST): false, (TEST_STAGE_LIST): null, (GPU_TYPE_LIST): null, (TEST_BACKEND): null, (IS_POST_MERGE): false, (ADD_MULTI_GPU_TEST): false, (ONLY_MULTI_GPU_TEST): false, (DISABLE_MULTI_GPU_TEST): false, (EXTRA_STAGE_LIST): null, (MULTI_GPU_FILE_CHANGED): false, (ONLY_ONE_GROUP_CHANGED): "", (DEBUG_MODE): false, (AUTO_TRIGGER_TAG_LIST): [], (DETAILED_LOG): false, ] @Field def GITHUB_PR_API_URL = "github_pr_api_url" @Field def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list" @Field def ACTION_INFO = "action_info" @Field def IMAGE_KEY_TO_TAG = "image_key_to_tag" def globalVars = [ (GITHUB_PR_API_URL): null, (CACHED_CHANGED_FILE_LIST): null, (ACTION_INFO): null, 
(IMAGE_KEY_TO_TAG): [:], ] class GlobalState { static def uploadResultStageNames = [] // HOST_NODE_NAME to starting port section map // This map maintains the next available starting port for each host node // to avoid port conflicts when running parallel tests on the same node. // Key: HOST_NODE_NAME (e.g., "node-01.cluster.local") // Value: Next available starting port number for that node static def hostNodePortMap = [:] // Port allocation configuration static final int BASE_PORT = 10000 // Base starting port static final int PORT_SECTION_SIZE = 1000 // Number of ports per section/stage static final int MAX_PORT = 32000 // Maximum port number to avoid system ports } /** * Allocates and returns a starting port section for the given host node. * This function is thread-safe and ensures each stage running on the same * host node gets a unique port range to avoid conflicts. * * @param hostNodeName The HOST_NODE_NAME of the node running the stage * @param stageName Optional stage name for logging purposes * @return The starting port number for this stage's port section */ def getStartingPortForHost(String hostNodeName, String stageName = "") { lock(resource: 'globalstate-hostNodePortMap') { def startingPort = GlobalState.hostNodePortMap.get(hostNodeName, GlobalState.BASE_PORT) // Store the next available starting port for this host def nextPort = startingPort + GlobalState.PORT_SECTION_SIZE // Wrap around if we exceed MAX_PORT if (nextPort > GlobalState.MAX_PORT) { nextPort = GlobalState.BASE_PORT } GlobalState.hostNodePortMap[hostNodeName] = nextPort return startingPort } } /** * Gets the HOST_NODE_NAME from the current environment. * Falls back to hostname if HOST_NODE_NAME is not set. * * @return The host node name */ def getHostNodeName() { return sh(script: ''' if [ -n "$HOST_NODE_NAME" ]; then echo "$HOST_NODE_NAME" else hostname -f || hostname fi ''', returnStdout: true).trim() } String getShortenedJobName(String path) { static final nameMapping = [ "L0_MergeRequest": "l0-mr", "L0_Custom": "l0-cus", "L0_PostMerge": "l0-pm", "L0_PostMergeDocker": "l0-pmd", "L1_Custom": "l1-cus", "L1_Nightly": "l1-nt", "L1_Stable": "l1-stb", "BuildDockerImageSanityTest": "img-check", ] def parts = path.split('/') // Apply nameMapping to the last part (jobName) def jobName = parts[-1] boolean replaced = false nameMapping.each { key, value -> if (jobName.contains(key)) { jobName = jobName.replace(key, value) replaced = true } } if (!replaced) { jobName = jobName.length() > 7 ? jobName.substring(0, 7) : jobName } // Replace the last part with the transformed jobName parts[-1] = jobName // Rejoin the parts with '-', convert to lowercase return parts.join('-').toLowerCase() } def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSuccess=false, postTag="") { checkStageName([stageName]) def Boolean stageIsInterrupted = false def Boolean stageIsFailed = true try { taskRunner() stageIsFailed = false } catch (InterruptedException e) { stageIsInterrupted = true throw e } finally { ensureStageResultNotUploaded(stageName + postTag) if (stageIsInterrupted) { echo "Stage is interrupted, skip to upload test result." } else { // Temporarily disable to reduce the log size // sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi' if (noResultIfSuccess && !stageIsFailed) { // Clean up the workspace sh """ env | sort pwd && ls -alh rm -rf ./* """ echo "Finished test stage execution." 
return } echo "noResultIfSuccess: ${noResultIfSuccess}, stageIsFailed: ${stageIsFailed}" sh "mkdir -p ${stageName}" finallyRunner() if (stageIsFailed) { def timeoutTestXml = generateTimeoutTestResultXml(stageName, "unfinished_test.txt") if (timeoutTestXml != null) { sh """ cat > ${stageName}/results-timeout.xml << 'EOF_TIMEOUT_XML' ${timeoutTestXml} EOF_TIMEOUT_XML """ } def stageXml = generateStageFailTestResultXml(stageName, "Stage Failed", "Stage run failed without result", "results*.xml") if (stageXml != null) { sh "echo '${stageXml}' > ${stageName}/results-stage.xml" } } sh "STAGE_NAME=${stageName}" sh "STAGE_NAME=${stageName} && env | sort > ${stageName}/debug_env.txt" echo "Upload test results." sh "tar -czvf results-${stageName}${postTag}.tar.gz ${stageName}/" trtllm_utils.uploadArtifacts( "results-${stageName}${postTag}.tar.gz", "${UPLOAD_PATH}/test-results/" ) junit(testResults: "${stageName}/results*.xml") } // Clean up the workspace sh """ env | sort pwd && ls -alh rm -rf ./* """ echo "Finished test stage execution." } } def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false) { def targetCloud = "kubernetes-cpu" def selectors = """ nvidia.com/node_type: builder kubernetes.io/arch: ${arch} kubernetes.io/os: linux""" def containerConfig = "" def nodeLabelPrefix = "" def jobName = getShortenedJobName(env.JOB_NAME) def buildID = env.BUILD_ID def tolerations = "" def extraDeviceEnv = "" def archSuffix = arch == "arm64" ? "arm" : "amd" def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17" switch(type) { case "agent": containerConfig = """ - name: alpine image: urm.nvidia.com/docker/alpine:latest command: ['cat'] tty: true resources: requests: cpu: '2' memory: 10Gi ephemeral-storage: 25Gi limits: cpu: '2' memory: 10Gi ephemeral-storage: 25Gi imagePullPolicy: Always""" nodeLabelPrefix = "cpu" break case "slurm": containerConfig = """ - name: trt-llm image: ${image} command: ['sleep', ${POD_TIMEOUT_SECONDS_SLURM}] tty: true resources: requests: cpu: ${SLURM_CORES_REQUEST} memory: ${SLURM_MEMORY_REQUEST} ephemeral-storage: 100Gi limits: cpu: ${SLURM_CORES_LIMIT} memory: ${SLURM_MEMORY_LIMIT} ephemeral-storage: 100Gi imagePullPolicy: Always""" nodeLabelPrefix = "cpu" break case "build": containerConfig = """ - name: trt-llm image: ${image} command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}] volumeMounts: - name: sw-tensorrt-pvc mountPath: "/mnt/sw-tensorrt-pvc" readOnly: false tty: true resources: requests: cpu: ${BUILD_CORES_REQUEST} memory: ${BUILD_MEMORY_REQUEST} ephemeral-storage: 200Gi limits: cpu: ${BUILD_CORES_LIMIT} memory: ${BUILD_MEMORY_LIMIT} ephemeral-storage: 200Gi imagePullPolicy: Always""" nodeLabelPrefix = "cpu" break default: def hasMultipleGPUs = (gpuCount > 1) def memorySize = "${TESTER_MEMORY}" def storageSize = "300Gi" def driverVersion = REQUIRED_OPEN_DRIVER_TYPES.any { type.contains(it) } ? Constants.DEFAULT_NVIDIA_OPEN_DRIVER_VERSION : Constants.DEFAULT_NVIDIA_DRIVER_VERSION def cpuCount = "${TESTER_CORES}" if (hasMultipleGPUs) { // Not a hard requirement, but based on empirical values. memorySize = "${gpuCount * 150}" + "Gi" storageSize = "${gpuCount * 150}" + "Gi" cpuCount = "${gpuCount * 12}" } def gpuType = KubernetesManager.selectGPU(type) nodeLabelPrefix = type targetCloud = "kubernetes" // DGX Spark requires a special setting for accessing the device. // It has 128GB unified memory as per spec. Use half of the memory at the CPU side. 
if (type.contains("gb10x")) { targetCloud = "nvks-sparks-cloud" memorySize = "64Gi" tolerations = """ tolerations: - key: "node_for_blossom_trt" operator: "Exists" effect: "NoSchedule" """ extraDeviceEnv = """ - name: NVIDIA_VISIBLE_DEVICES value: "all" - name: NVIDIA_DRIVER_CAPABILITIES value: "compute,utility" """ } // The following GPU types doesn't support dynamic driver flashing. if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) { if (type.contains("gb10x")) { selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux nvidia.com/gpu.machine: NVIDIA_DGX_Spark nvidia.com/tenant: blossom_trt""" } else { selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux nvidia.com/gpu_type: ${gpuType}""" } } else if (perfMode && !hasMultipleGPUs) { // Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing. // H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool. selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux nvidia.com/gpu_type: ${gpuType} nvidia.com/driver_version: '${driverVersion}' tensorrt/test_type: perf""" } else { selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux nvidia.com/gpu_type: ${gpuType} nvidia.com/driver_version: '${driverVersion}'""" } containerConfig = """ - name: trt-llm image: ${image} command: ['sleep', ${POD_TIMEOUT_SECONDS_TEST}] tty: true resources: requests: cpu: ${cpuCount} memory: ${memorySize} nvidia.com/gpu: ${gpuCount} ephemeral-storage: ${storageSize} limits: cpu: ${cpuCount} memory: ${memorySize} nvidia.com/gpu: ${gpuCount} ephemeral-storage: ${storageSize} imagePullPolicy: Always volumeMounts: - name: dshm mountPath: /dev/shm - name: scratch-trt-llm-data mountPath: /scratch.trt_llm_data readOnly: true - name: sw-tensorrt-pvc mountPath: "/mnt/sw-tensorrt-pvc" readOnly: false securityContext: capabilities: add: - SYS_ADMIN""" break } def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}") def pvcVolume = """ - name: sw-tensorrt-pvc persistentVolumeClaim: claimName: sw-tensorrt-pvc """ if (arch == "arm64") { // PVC mount isn't supported on aarch64 platform. Use NFS as a WAR. pvcVolume = """ - name: sw-tensorrt-pvc nfs: server: 10.117.145.13 path: /vol/scratch1/scratch.svc_tensorrt_blossom """ } def llmModelVolume = """ - name: scratch-trt-llm-data nfs: server: 10.117.145.14 path: /vol/scratch1/scratch.michaeln_blossom """ if (type.contains("6000d") || type.contains("gh200")) { // rtx-pro-6000d and gh200 nodes are located in Austin DC, we use the FlexCache to speed up the data access. 
llmModelVolume = """ - name: scratch-trt-llm-data nfs: server: 10.20.162.212 path: /vol/scratch26/scratch.trt_llm_data """ } def podConfig = [ cloud: targetCloud, namespace: "sw-tensorrt", label: nodeLabel, yaml: """ apiVersion: v1 kind: Pod spec: qosClass: Guaranteed affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: "tensorrt/taints" operator: DoesNotExist - key: "tensorrt/affinity" operator: NotIn values: - "core" nodeSelector: ${selectors} containers: ${containerConfig} env: - name: HOST_NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName ${extraDeviceEnv} - name: jnlp image: ${jnlpImage} args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)'] resources: requests: cpu: '2' memory: 10Gi ephemeral-storage: 25Gi limits: cpu: '2' memory: 10Gi ephemeral-storage: 25Gi qosClass: Guaranteed volumes: - name: dshm emptyDir: medium: Memory ${llmModelVolume} ${pvcVolume} ${tolerations} """.stripIndent(), ] return podConfig } def echoNodeAndGpuInfo(pipeline, stageName) { String hostNodeName = sh(script: ''' if [ -n "$HOST_NODE_NAME" ]; then echo "$HOST_NODE_NAME" else hostname -f || hostname fi ''', returnStdout: true).trim() String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true) pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}" } def runLLMDocBuild(pipeline, config) { // Step 1: cloning source code sh "pwd && ls -alh" sh "env | sort" // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true) sh "mkdir TensorRT-LLM" sh "cp -r ${LLM_ROOT}/ TensorRT-LLM/src/" trtllm_utils.llmExecStepWithRetry(pipeline, script: "git config --global --add safe.directory \"*\"") def llmPath = sh (script: "realpath .", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" // Step 2: download TRT-LLM tarfile def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${BUILD_CONFIGS[config][TARNAME]}" trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}") sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}" // install python package if (env.alternativeTRT) { sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt" } trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install -r requirements-dev.txt") trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl") // Step 3: build doc trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y doxygen python3-pip graphviz") def containerPATH = sh(script: "echo \${PATH}", returnStdout: true).replaceAll("\\s", "") if (!containerPATH.contains("/usr/local/bin:")) { echo "Prepend /usr/local/bin into \${PATH}" containerPATH = "/usr/local/bin:${containerPATH}" } containerPATH = containerPATH.replaceAll(':+$', '') withEnv(["PATH=${containerPATH}"]) { sh "env | sort" sh "rm -rf ${LLM_ROOT}/docs/build" trtllm_utils.llmExecStepWithRetry( pipeline, script: """ cd ${LLM_ROOT}/docs && \ pip3 install -r requirements.txt && \ pip3 install git+https://github.com/sphinx-doc/sphinx.git@v7.4.7 && \ doxygen Doxygen && \ make html && \ cd build/html && \ touch .nojekyll """ ) } echo "Upload built html." 
sh "tar -czvf doc-html-preview.tar.gz ${LLM_ROOT}/docs/build/html" trtllm_utils.uploadArtifacts( "doc-html-preview.tar.gz", "${UPLOAD_PATH}/test-results/" ) } def launchTestListCheck(pipeline) { stageName = "Test List Check" trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10"), "trt-llm", { try { echoNodeAndGpuInfo(pipeline, stageName) trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y libffi-dev") sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m" // download TRT-LLM tarfile def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME] def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}" trtllm_utils.llmExecStepWithRetry(pipeline, script: "pwd && wget -nv ${llmTarfile} && ls -alh") sh "tar -zxf ${tarName}" def llmPath = sh (script: "realpath .", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt") sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive" } catch (InterruptedException e) { throw e } catch (Exception e) { throw e } }) } def generateTimeoutTestResultXml(stageName, testFilePath) { if (!fileExists("${stageName}/${testFilePath}")) { echo "No ${testFilePath} found in ${stageName}, skipping timeout XML generation" return null } String timeoutTests = sh(script: "cd ${stageName} && cat ${testFilePath}", returnStdout: true).trim() echo "timeoutTests: ${timeoutTests}" if (timeoutTests == null || timeoutTests == "") { return null } def testList = timeoutTests.split("\n") String xmlContent = """ """ testList.each { test -> xmlContent += """ Test terminated unexpectedly """ } xmlContent += "" return xmlContent } def generateStageFailTestResultXml(stageName, subName, failureLog, resultPath) { String resultFiles = sh(script: "cd ${stageName} && ls -l ${resultPath} | wc -l", returnStdout: true).trim() echo "resultFiles: ${resultFiles}" if (resultFiles != "0") { return null } return """ ${failureLog} """ } def transformMakoArgsToJson(optList) { def makoOpts = [:] def startedMakoOpts = false def param = null def value = null optList.each { val -> if (startedMakoOpts) { // Handle case where value is missing param = null value = null try { (param, value) = val.split("=") } catch (ArrayIndexOutOfBoundsException ex) { param = val.split("=")[0] value = null } // Try to convert nulls, booleans, and floats into the correct type if (value != null) { if (value.toLowerCase() == "none") { echo "Converted mako param '${param}' value '${value}' to 'null'" value = null } else if (value.toLowerCase() in ["true", "false"]) { echo "Converted mako param '${param}' value '${value}' to Boolean '${value.toBoolean()}'" value = value.toBoolean() } } makoOpts[(param)] = value } if (val.equals("Mako options:")) { startedMakoOpts = true } } def makoOptsJson = JsonOutput.toJson(makoOpts) // Print and return the Test DB Query as a JSON string echo "Test DB Mako opts: ${makoOptsJson}" return makoOptsJson } def getMakoOpts(getMakoScript, makoArgs=[]) { // We want to save a map for the Mako opts def makoOutput = "" // Echo the command // NOTE: We redirect stderr to stdout so that we can capture // both stderr and stdout streams with the 'returnStdout' flag // in sh command. 
def listMakoCmd = [ "python3", getMakoScript, "--device 0"].join(" ") if (makoArgs) { def makoOptArgs = makoArgs.collect { "--mako-opt " + it } listMakoCmd += " " + makoOptArgs.join(" ") } // Add the withCredentials step to access gpu-chip-mapping file withCredentials([file(credentialsId: 'gpu-chip-mapping', variable: 'GPU_CHIP_MAPPING')]) { listMakoCmd = [listMakoCmd, "--chip-mapping-file ${GPU_CHIP_MAPPING}"].join(" ") listMakoCmd = [listMakoCmd, "2>&1"].join(" ") echo "Scripts to get Mako list, cmd: ${listMakoCmd}" // Capture the mako output, add timeout in case any hang timeout(time: 30, unit: 'MINUTES'){ makoOutput = sh(label: "Capture Mako Parameters", script: listMakoCmd, returnStdout: true) } } // Validate output assert makoOutput: "Mako opts not found - could not construct test db test list." // Split each line of mako output into a list def outputList = makoOutput.split("\n") def makoOptsJson = transformMakoArgsToJson(outputList) return makoOptsJson } def parseMultiNodeTaskConfigFromStageName(String stageName) { def taskConfig = null def matcher = (stageName =~ /([^-]+)-(\d+)_GPUs(?:-(\d+)_Nodes)?/) if (matcher.find()) { taskConfig = [ gpu: "${matcher.group(1)}", system_gpu_count: "${matcher.group(2)}", node_count: matcher.group(3) ?: "1" // Default to 1 if _Nodes not present ] } return taskConfig } def getMakoArgsFromStageName(stageName, parseSysinfo=false) { def makoArgs = [] def isPostMerge = stageName.contains("Post-Merge") makoArgs += [isPostMerge ? "stage=post_merge" : "stage=pre_merge"] // Determine the backend type based on keywords in stageName if (stageName.contains("-PyTorch-")) { // If stageName contains "-PyTorch-", add "backend=pytorch" to makoArgs // At this point, only tests with backend=pytorch or unspecified backend will be run makoArgs += ["backend=pytorch"] } else if (stageName.contains("-TensorRT-")) { // If stageName contains "-TensorRT-", add "backend=tensorrt" to makoArgs // At this point, only tests with backend=tensorrt or unspecified backend will be run makoArgs += ["backend=tensorrt"] } else if (stageName.contains("-CPP-")) { // If stageName contains "-CPP-", add "backend=cpp" to makoArgs // At this point, only tests with backend=cpp or unspecified backend will be run makoArgs += ["backend=cpp"] } else if (stageName.contains("-Triton-")) { // If stageName contains "-Triton-", add "backend=triton" to makoArgs // At this point, only tests with backend=triton or unspecified backend will be run makoArgs += ["backend=triton"] } else if (stageName.contains("-FMHA-")) { // If stageName contains "-FMHA-", add "backend=fmha" to makoArgs // At this point, only tests with backend=fmha or unspecified backend will be run makoArgs += ["backend=fmha"] } else { // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", or "-FMHA-", do not add any backend // At this point, all tests will be run // For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend } if (stageName.contains("-DeepSeek-")) { makoArgs += ["auto_trigger=deepseek"] } else if (stageName.contains("-GptOss-")) { makoArgs += ["auto_trigger=gpt_oss"] } else { makoArgs += ["auto_trigger=others"] } if (stageName.contains("-Ray-")) { // If stageName contains "-Ray-", add "orchestrator=ray" to makoArgs // At this point, only tests with orchestrator=ray or unspecified orchestrator will be run. // Mark tests with orchestrator=mpi to exclude them from Ray stage. 
makoArgs += ["orchestrator=ray"] } else { // Otherwise select tests with orchestrator=mpi or unspecified orchestrator makoArgs += ["orchestrator=mpi"] } if (parseSysinfo) { def taskConfig = parseMultiNodeTaskConfigFromStageName(stageName) if (taskConfig) { makoArgs += [ "gpu=${taskConfig.gpu}", "system_gpu_count=${taskConfig.system_gpu_count}" ] } } return makoArgs } def renderTestDB(testContext, llmSrc, stageName, preDefinedMakoOpts=null) { def makoOpts = preDefinedMakoOpts if (!makoOpts) { def scriptPath = "${llmSrc}/tests/integration/defs/sysinfo/get_sysinfo.py" def makoArgs = getMakoArgsFromStageName(stageName) makoOpts = getMakoOpts(scriptPath, makoArgs) } sh "pip3 install --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/sw-tensorrt-pypi/simple --ignore-installed trt-test-db==1.8.5+bc6df7" def testDBPath = "${llmSrc}/tests/integration/test_lists/test-db" def testList = "${llmSrc}/${testContext}.txt" def testDBQueryCmd = [ "trt-test-db", "-d", testDBPath, "--context", testContext, "--test-names", "--output", testList, "--match", "'${makoOpts}'" ].join(" ") sh(label: "Render test list from test-db", script: testDBQueryCmd) sh(script: "cat ${testList}") return testList } def getSSHConnectionPorts(portConfigFile, stageName) { def type = stageName.split('-')[0] echo "The type is: ${type}" def fileContent = sh(script: "cat ${portConfigFile}", returnStdout: true).trim() // Get available VM port list from portConfigFile based on stage name (e.g. A10: [10022, 10023]) def portList = [] fileContent.split('\n').each { line -> def matcher = (line =~ /(.+?)=\[(.+?)\]/) if (matcher) { def key = matcher[0][1].replaceAll("\\s","") def values = matcher[0][2].replaceAll("\\s","").split(',').collect { it.replaceAll("\\s","") } if (key == type) { portList.addAll(values) } } } echo "Port List for ${type}: ${portList}" // Get current port usage status def portUsage = "" withCredentials([ usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'), string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME') ]) { portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true) } echo "Port Usage: ${portUsage}" // Get an available VM port def userPort = 0 while (portList.size() > 0) { def randomIndex = (int)(Math.random() * portList.size()) def curPort = portList[randomIndex].toInteger() if (!portUsage.contains(":${curPort}")) { userPort = curPort break } portList.remove(randomIndex) } if (userPort == 0) { echo "There is no available port for ${type}" return [0, 0] } echo "The chosen port is: ${userPort}" // Calculate autossh monitor port by subtracting 9000 from VM port (e.g. 10022 -> 1022) // If monitor port is already in use, randomly assign a value between 2000-3000 def monitorPort = userPort - 9000 while (portUsage.contains(":${monitorPort}")) { monitorPort = 2000 + (int)(Math.random() * 1000) } echo "The monitor port is: ${monitorPort}" return [userPort, monitorPort] } // Return true means the test rerun also fails. Return false otherwise. 
def rerunFailedTests(stageName, llmSrc, testCmdLine, resultFileName="results.xml", testType="regular") { if (!fileExists("${WORKSPACE}/${stageName}/${resultFileName}")) { echo "There is no ${resultFileName} file, skip the rerun step" return true } // Create rerun directory structure to avoid conflicts def rerunDir = "${WORKSPACE}/${stageName}/rerun/${testType}" sh "mkdir -p ${rerunDir}" // Generate rerun test lists def failSignaturesList = trtllm_utils.getFailSignaturesList().join(",") sh """ python3 ${llmSrc}/jenkins/scripts/test_rerun.py \ generate_rerun_tests_list \ --output-dir=${rerunDir}/ \ --input-file=${WORKSPACE}/${stageName}/${resultFileName} \ --fail-signatures='${failSignaturesList}' """ // If there are some failed tests that cannot be rerun (e.g. test duration > 10 min and no known failure signatures), // fail the stage immediately without attempting any reruns def rerunTestList = "${rerunDir}/rerun_0.txt" if (fileExists(rerunTestList)) { sh "cat ${rerunTestList}" echo "There are some failed tests that cannot be rerun, skip the rerun step." return true } // If the stage has more than 5 failed tests, skip the rerun step def validLineCount = 0 for (times in [1, 2]) { def currentRerunTestList = "${rerunDir}/rerun_${times}.txt" if (fileExists(currentRerunTestList)) { count = sh( script: "grep -v '^[[:space:]]*\$' ${currentRerunTestList} | wc -l", returnStdout: true ).trim().toInteger() echo "Found ${count} ${testType} tests to rerun ${times} time(s)" validLineCount += count } } if (validLineCount > 5) { echo "There are more than 5 failed ${testType} tests, skip the rerun step." return true } else if (validLineCount == 0) { echo "No failed ${testType} tests need to be rerun, skip the rerun step." return true } // Rerun tests def isRerunFailed = false for (times in [1, 2]) { def currentRerunTestList = "${rerunDir}/rerun_${times}.txt" if (!fileExists(currentRerunTestList)) { echo "No failed ${testType} tests need to be rerun ${times} time(s)" continue } sh "cat ${currentRerunTestList}" def xmlFile = "${rerunDir}/rerun_results_${times}.xml" // change the testCmdLine for rerun def noNeedLine = ["--splitting-algorithm", "--splits", "--group", "--cov"] def needToChangeLine = ["--test-list", "--csv", "--periodic-junit-xmlpath"] def newTestCmdLine = testCmdLine.findAll { cmd -> !noNeedLine.any { line -> cmd.contains(line) } && !needToChangeLine.any { line -> cmd.contains(line) } } newTestCmdLine += [ "--test-list=${currentRerunTestList}", "--csv=${rerunDir}/rerun_report_${times}.csv", "--periodic-junit-xmlpath ${xmlFile}", "--reruns ${times - 1}" ] try { sh """ cd ${llmSrc}/tests/integration/defs && \ ${newTestCmdLine.join(" ")} """ } catch(InterruptedException e) { throw e } catch (Exception e) { if (!fileExists(xmlFile)) { echo "The ${testType} tests crashed when rerun attempt." throw e } echo "The ${testType} tests still failed after rerun attempt." isRerunFailed = true } } echo "isRerunFailed for ${testType}: ${isRerunFailed}" return isRerunFailed } def generateRerunReport(stageName, llmSrc) { echo "Generating comprehensive rerun report for stage: ${stageName}" def rerunBaseDir = "${WORKSPACE}/${stageName}/rerun" def regularRerunDir = "${rerunBaseDir}/regular" // Check if regular rerun directory has rerun_results_*.xml files def hasRegularReruns = sh(script: "[ -d '${regularRerunDir}' ] && find '${regularRerunDir}' -name 'rerun_results_*.xml' | head -1 | grep -q . 
&& echo 'true' || echo 'false'", returnStdout: true).trim() == 'true' // Check if any isolated rerun directories have rerun_results_*.xml files def hasIsolatedReruns = sh(script: "find ${rerunBaseDir} -type d -name 'isolated_*' -exec find {} -name 'rerun_results_*.xml' \\; 2>/dev/null | head -1 | grep -q . && echo 'true' || echo 'false'", returnStdout: true).trim() == 'true' // Find isolated tests that have actual rerun results and build mapping def isolatedTestsWithReruns = [] if (hasIsolatedReruns) { def isolatedDirsOutput = sh(script: "find ${rerunBaseDir} -type d -name 'isolated_*' 2>/dev/null || true", returnStdout: true).trim() if (isolatedDirsOutput) { def isolatedDirs = isolatedDirsOutput.split('\n').findAll { it.trim() } isolatedDirs.each { isolatedDir -> // Extract the isolated number from directory name (e.g., isolated_0 -> 0) def isolatedNum = isolatedDir.split('/').last().replace('isolated_', '') // Check if this isolated directory has any rerun results def hasRerunResults = sh(script: "find '${isolatedDir}' -name 'rerun_results_*.xml' | head -1 | grep -q . && echo 'true' || echo 'false'", returnStdout: true).trim() == 'true' if (hasRerunResults) { isolatedTestsWithReruns.add([ dir: isolatedDir, num: isolatedNum, originalResult: "${WORKSPACE}/${stageName}/results_isolated_${isolatedNum}.xml" ]) } } } } // Collect rerun result files and corresponding original result files def rerunResultFiles = [] echo "Found regular reruns: ${hasRegularReruns}" echo "Found isolated tests with reruns: ${isolatedTestsWithReruns.collect { "isolated_${it.num}" }}" if (!hasRegularReruns && !hasIsolatedReruns) { echo "No rerun results found, skipping rerun report generation" return } // Specify the stage name correctly for all result xml files. sh "cd ${WORKSPACE}/${stageName} && find . 
-name '*.xml' -exec sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' {} + || true" // Collect all original and rerun result files def allInputFiles = [] // Add original results if (fileExists("${WORKSPACE}/${stageName}/results.xml")) { allInputFiles.add("${WORKSPACE}/${stageName}/results.xml") // Add to rerunResultFiles only if it has reruns if (hasRegularReruns) { rerunResultFiles.add("${WORKSPACE}/${stageName}/results.xml") } } // Add ALL isolated test results to allInputFiles def isolatedResults = sh(script: "find ${WORKSPACE}/${stageName} -name 'results_isolated_*.xml' 2>/dev/null || true", returnStdout: true).trim() if (isolatedResults) { isolatedResults.split('\n').each { file -> if (file.trim()) { allInputFiles.add(file.trim()) } } // Add isolated test results that have reruns to rerunResultFiles and add their rerun results to allInputFiles isolatedTestsWithReruns.each { isolatedTest -> if (fileExists(isolatedTest.originalResult)) { rerunResultFiles.add(isolatedTest.originalResult) echo "Added isolated result with reruns to rerunResultFiles: ${isolatedTest.originalResult}" } for (times in [1, 2]) { def rerunFile = "${isolatedTest.dir}/rerun_results_${times}.xml" if (fileExists(rerunFile)) { allInputFiles.add(rerunFile) rerunResultFiles.add(rerunFile) } } } } // Add regular rerun results if (hasRegularReruns) { for (times in [1, 2]) { def rerunFile = "${regularRerunDir}/rerun_results_${times}.xml" if (fileExists(rerunFile)) { allInputFiles.add(rerunFile) rerunResultFiles.add(rerunFile) } } } if (allInputFiles.isEmpty()) { echo "No valid input files found for rerun report generation" return } echo "Generating rerun report with input files: ${rerunResultFiles.join(',')}" // Generate comprehensive rerun report sh """ python3 ${llmSrc}/jenkins/scripts/test_rerun.py \ generate_rerun_report \ --output-file=${WORKSPACE}/${stageName}/rerun_results.xml \ --input-files=${rerunResultFiles.join(",")} """ // Update original results xml file with all rerun results for junit sh """ python3 ${llmSrc}/jenkins/scripts/test_rerun.py \ merge_junit_xmls \ --output-file=${WORKSPACE}/${stageName}/results.xml \ --input-files=${allInputFiles.join(",")} \ --deduplicate """ // Upload rerun report if (fileExists("${WORKSPACE}/${stageName}/rerun_results.html")) { trtllm_utils.uploadArtifacts( "${WORKSPACE}/${stageName}/rerun_results.html", "${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html" ) echo "Test rerun report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/rerun_reports/${stageName}_rerun_results.html" } // Remove isolation results since they are merged into results.xml sh "rm -rf ${WORKSPACE}/${stageName}/results_isolated_*.xml || true" echo "Rerun report generation completed for stage: ${stageName}" } def mergeWaivesTxt(pipeline, llmSrc, stageName) { def waivesTxt = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/waive_list/waives.txt" try { trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${waivesTxt}") if (!fileExists("waives.txt")) { error "There is no merged waives.txt file, use the default waives.txt." } sh "rm ${llmSrc}/tests/integration/test_lists/waives.txt" sh "mv waives.txt ${llmSrc}/tests/integration/test_lists/waives.txt" echo "Download merged waives.txt successfully" } catch (InterruptedException e) { throw e } catch (Exception e) { echo "Failed to download merged waives.txt, use the default waives.txt. 
Error: ${e.message}" } } def reusePassedTestResults(llmSrc, stageName, waivesTxt) { try { // Get passed test list from open search def passedTestListFile = "${WORKSPACE}/${stageName}/passed_test_list.txt" sh """ python3 ${llmSrc}/jenkins/scripts/open_search_query.py \ --commit-id ${env.gitlabCommit} \ --stage-name ${stageName} \ --output-file ${passedTestListFile} """ def passedTestList = readFile(file: passedTestListFile).readLines() def reusedTests = passedTestList.collect { test -> test.trim() } // Append reused tests to waives.txt if (reusedTests.size() > 0) { // Build the content to append def reusedTestsContent = reusedTests.collect { test -> "${test} SKIP (Reused from previous pipeline)" }.join('\n') echo "Reused tests:\n${reusedTestsContent}" sh(label: "Append Reused Tests", script: """ cat >> ${waivesTxt} << 'REUSED_TESTS_EOF' ${reusedTestsContent} REUSED_TESTS_EOF """) echo "Appended ${reusedTests.size()} reused tests to ${waivesTxt}" } else { echo "No reused tests found" } } catch (InterruptedException e) { throw e } catch (Exception e) { echo "Failed to add passed test list from previous pipeline run to the waives.txt. Error: ${e.message}" } } def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312") { // Step 1: create LLM_ROOT dir and clean up the workspace def llmRootConfig = "${LLM_ROOT}${config}" sh """ env | sort pwd && ls -alh rm -rf ./* mkdir ${llmRootConfig} ls -alh ${env.WORKSPACE} ls -alh ${env.WORKSPACE_TMP} """ def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" echoNodeAndGpuInfo(pipeline, stageName) if (env.alternativeTRT && cpver) { stage("Replace TensorRT") { trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, cpver) } } // Step 2: run tests stage ("Setup Environment") { // Random sleep to avoid resource contention sleep(10 * Math.random()) sh "curl ifconfig.me || true" sh "nproc && free -g && hostname" echoNodeAndGpuInfo(pipeline, stageName) sh "cat ${MODEL_CACHE_DIR}/README" sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m" sh "df -h" // setup HF_HOME to cache model and datasets // init the huggingface cache from nfs, since the nfs is read-only, and HF_HOME needs to be writable, otherwise it will fail at creating file lock sh "mkdir -p ${HF_HOME} && ls -alh ${HF_HOME}" trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y rsync") trtllm_utils.llmExecStepWithRetry(pipeline, script: "rsync -r ${MODEL_CACHE_DIR}/hugging-face-cache/ ${HF_HOME}/ && ls -lh ${HF_HOME}") sh "df -h" // install package sh "env | sort" sh "which python3" sh "python3 --version" trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get install -y libffi-dev") sh "rm -rf results-${stageName}.tar.gz ${stageName}/*" // download TRT-LLM tarfile def tarName = BUILD_CONFIGS[config][TARNAME] def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}" trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}") sh "cd ${llmPath} && tar -zxf ${tarName}" // install python package if (env.alternativeTRT) { sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt" } trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install -r requirements-dev.txt") if (stageName.contains("-Ray-")) { trtllm_utils.llmExecStepWithRetry(pipeline, script: 
"pip3 install ray[default]") } if (!skipInstallWheel) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl") } trtllm_utils.llmExecStepWithRetry(pipeline, script: "mkdir -p /opt/tritonserver/backends/tensorrtllm") def isAarch64 = config.contains("aarch64") if (!isAarch64) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && cp TensorRT-LLM/triton_backend/inflight_batcher_llm/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/") trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && cp TensorRT-LLM/triton_backend/inflight_batcher_llm/trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm/") } trtllm_utils.llmExecStepWithRetry(pipeline, script: "git config --global --add safe.directory \"*\"") } if (testFilter[(DEBUG_MODE)]) { stage("Interactive Debug Session") { testFilter[(DEBUG_MODE)] = false trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get install openssh-server -y") trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get install autossh -y") trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get install sshpass -y") sh """ echo 'Port 22' >> /etc/ssh/sshd_config echo 'PermitRootLogin yes' >> /etc/ssh/sshd_config echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config echo 'PubkeyAuthentication yes' >> /etc/ssh/sshd_config echo 'AllowTcpForwarding yes' >> /etc/ssh/sshd_config echo 'GatewayPorts yes' >> /etc/ssh/sshd_config cat /etc/ssh/sshd_config """ sh "service ssh restart" sh "service ssh status" sh "ssh-keygen -t rsa -b 2048 -f ~/.ssh/id_rsa -N '' -q" sh """ chmod 700 ~/.ssh chmod 400 ~/.ssh/id_rsa touch ~/.ssh/authorized_keys chmod 600 ~/.ssh/authorized_keys """ // Write env variables to a file sh 'env | sort | sed -E \'s/^([^=]+)=(.*)$/export \\1="\\2"/\' > debug_env.sh' sh "cat debug_env.sh" // The portConfig file is in the VM def portConfigFilePath = "/root/.ssh/ports_config.txt" withCredentials([ usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'), string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME') ]) { sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub" sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys" sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}" def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName) if (userPort == 0) { echo "Fail to setup an interactive debug session and exit the debug mode." testFilter[(DEBUG_MODE)] = false return } sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" sh "ps aux | grep ssh" try { timeout(time: 2, unit: 'HOURS') { input message: "Pause 2 hours for Pre-Debug. Please type 'ssh root@${HOST_NAME} -p ${userPort}' on the CLI to create the connection. Please press the button to proceed when you finish debugging." 
} } catch (InterruptedException e) { echo "Pre-debug session was interrupted by user or timeout" currentBuild.result = 'ABORTED' error("Pipeline aborted during pre-debug session") } catch (Exception e) { echo "An error occurred during pre-debug session: ${e.message}" currentBuild.result = 'FAILURE' error("Error in pre-debug session: ${e.message}") } } testFilter[(DEBUG_MODE)] = true } } stage ("[${stageName}] Run Pytest") { def noRegularTests = false def noIsolateTests = false def rerunFailed = false def testDBList = renderTestDB(testList, llmSrc, stageName) // Download and Merge waives.txt mergeWaivesTxt(pipeline, llmSrc, stageName) // Add passed test list from previous pipeline run to the waives.txt if (testFilter[(REUSE_TEST)] != false) { reusePassedTestResults(llmSrc, stageName, "${llmSrc}/tests/integration/test_lists/waives.txt") } // Process shard test list and create separate files for regular and isolate tests def preprocessedLists = processShardTestList(llmSrc, testDBList, splitId, splits, perfMode) // Test Coverage def TRTLLM_WHL_PATH = sh(returnStdout: true, script: "pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2").replaceAll("\\s","") sh "echo ${TRTLLM_WHL_PATH}" def coverageConfigFile = "${llmSrc}/${stageName}/.coveragerc" sh "mkdir -p ${llmSrc}/${stageName} && touch ${coverageConfigFile}" sh """ echo '[run]' > ${coverageConfigFile} echo 'branch = True' >> ${coverageConfigFile} echo 'data_file = ${WORKSPACE}/${stageName}/.coverage.${stageName}' >> ${coverageConfigFile} echo '[paths]' >> ${coverageConfigFile} echo 'source =\n ${llmSrc}/tensorrt_llm/\n ${TRTLLM_WHL_PATH}/tensorrt_llm/' >> ${coverageConfigFile} cat ${coverageConfigFile} """ echoNodeAndGpuInfo(pipeline, stageName) // Allocate a unique port section for this container to avoid port conflicts def hostNodeName = getHostNodeName() def containerPortStart = getStartingPortForHost(hostNodeName, stageName) def containerPortNum = GlobalState.PORT_SECTION_SIZE // Some clusters do not allow dmesg -C so we add || true // Temporarily disable to reduce the log size // sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi' def pytestCommand = getPytestBaseCommandLine( llmSrc, stageName, "${llmSrc}/tests/integration/test_lists/waives.txt", perfMode, "${WORKSPACE}/${stageName}", TRTLLM_WHL_PATH, coverageConfigFile, "", // pytestUtil [], // extraArgs containerPortStart, containerPortNum ) // Only add --test-list if there are regular tests to run if (preprocessedLists.regularCount > 0) { pytestCommand += ["--test-list=${preprocessedLists.regular}"] } def containerPIP_LLM_LIB_PATH = sh(script: "pip3 show tensorrt_llm | grep \"Location\" | awk -F\":\" '{ gsub(/ /, \"\", \$2); print \$2\"/tensorrt_llm/libs\"}'", returnStdout: true).replaceAll("\\s","") def containerLD_LIBRARY_PATH = sh(script: "echo \${LD_LIBRARY_PATH}", returnStdout: true).replaceAll("\\s","") if (!containerLD_LIBRARY_PATH.contains("${containerPIP_LLM_LIB_PATH}:")) { echo "Prepend ${containerPIP_LLM_LIB_PATH} into \${LD_LIBRARY_PATH}" containerLD_LIBRARY_PATH = "${containerPIP_LLM_LIB_PATH}:${containerLD_LIBRARY_PATH}" } containerLD_LIBRARY_PATH = containerLD_LIBRARY_PATH.replaceAll(':+$', '') withEnv(["LD_LIBRARY_PATH=${containerLD_LIBRARY_PATH}"]) { withCredentials([ usernamePassword( credentialsId: 'svc_tensorrt_gitlab_read_api_token', usernameVariable: 'GITLAB_API_USER', passwordVariable: 'GITLAB_API_TOKEN' ), string(credentialsId: 'llm_evaltool_repo_url', variable: 'EVALTOOL_REPO_URL') ]) { sh "env | sort" try { if (preprocessedLists.regularCount > 0) { 
sh """ rm -rf ${stageName}/ && \ cd ${llmSrc}/tests/integration/defs && \ ${pytestCommand.join(" ")} """ } else { echo "No regular tests to run for stage ${stageName}" noRegularTests = true sh "mkdir -p ${stageName}" // Create an empty results.xml file for consistency sh """ echo '' > ${stageName}/results.xml echo '' >> ${stageName}/results.xml echo '' >> ${stageName}/results.xml echo '' >> ${stageName}/results.xml echo '' >> ${stageName}/results.xml """ } } catch (InterruptedException e) { throw e } catch (Exception e) { def isRerunFailed = rerunFailedTests(stageName, llmSrc, pytestCommand, "results.xml", "regular") if (isRerunFailed) { catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { error "Regular tests failed after rerun attempt" } rerunFailed = true } } // Run the isolated tests if exists if (preprocessedLists.isolateCount > 0) { stage ("[${stageName}] Run Pytest (Isolated)") { echo "There are ${preprocessedLists.isolateCount} isolated tests to run" rerunFailed = runIsolatedTests(preprocessedLists, pytestCommand, llmSrc, stageName) || rerunFailed } } else { echo "No isolated tests to run for stage ${stageName}" noIsolateTests = true } if (noRegularTests && noIsolateTests) { error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result." } } } // Generate comprehensive rerun report if any reruns occurred stage ("Generate Report") { generateRerunReport(stageName, llmSrc) } if (rerunFailed) { error "Some tests still failed after rerun attempts, please check the test report." } if (perfMode) { basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv" basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}" stage("Check Perf Result") { def perfCheckResult = sh( script: """ python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ ${stageName}/perf_script_test_results.csv \ ${basePerfPath} """, returnStatus: true ) if (perfCheckResult != 0) { error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" } } stage("Create Perf Report") { sh """ python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \ --output_path ${stageName}/report.pdf \ --files ${stageName}/perf_script_test_results.csv \ ${basePerfPath} """ } } } } def runLLMTestlistOnPlatform(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", postTag="") { cacheErrorAndUploadResult(stageName, { runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver) }, { if (testFilter[(DEBUG_MODE)]) { try { timeout(time: 2, unit: 'HOURS') { input message: "Pause 2 hours for Post-Debug. Please press the button to proceed when you finish debugging." } } catch (InterruptedException e) { echo "Post-debug session was interrupted by user or timeout" currentBuild.result = 'ABORTED' error("Pipeline aborted during post-debug session") } catch (Exception e) { echo "An error occurred during post-debug session: ${e.message}" currentBuild.result = 'FAILURE' error("Error in post-debug session: ${e.message}") } } // If the execution test list is null, remove the test result xml sh """ ls -all ${stageName}/ if ! grep -q ' node(label) { try { if (needToDeleteDir) { deleteDir() } stage('Pull Docker Image') { docker.image(image).pull() } // We submit the Slurm job with the Slurm partition's time spec. 
// Minus 10 minutes to avoid the Slurm job being stopped earlier. timeout(time: partitionTimeout - 10, unit: 'MINUTES') { docker.image(image).inside(dockerArgs) { runner() } } } catch (Exception e) { if (e.getMessage()?.contains("Failed to kill container")) { echo "Known benign error ignored: ${e.getMessage()}" } else { throw e // Re-throw if it's a different Exception } } } } } def runInEnrootOnNode(label, partitionTimeout) { return { runner -> node(label) { // We submit the Slurm job with the Slurm partition's time spec. // Minus 10 minutes to avoid the Slurm job being stopped earlier. timeout(time: partitionTimeout - 10, unit: 'MINUTES') { runner() } } } } def runInKubernetes(pipeline, podSpec, containerName) { return { stageName, runner -> stage(stageName) { trtllm_utils.launchKubernetesPod(pipeline, podSpec, containerName) { echoNodeAndGpuInfo(pipeline, stageName) runner() } } } } def launchTestJobs(pipeline, testFilter) { // IMPORTANT: Stage Configuration Syntax Requirement // // The test_to_stage_mapping.py script expects stage definitions in the following format: // "Stage-Name": ["platform", "yaml_file", splitId, split_count, gpu_count] // // Where: // - Stage-Name: Must be quoted string, used to identify the Jenkins stage // - platform: Hardware platform identifier (e.g., "a10", "h100-cr") // - yaml_file: Test database YAML filename without .yml extension (e.g., "l0_a10") // - splitId: Current split number (1-based) // - split_count: Total number of splits // - gpu_count: Number of GPUs required (optional, defaults to 1) // // This format is parsed by scripts/test_to_stage_mapping.py to provide bidirectional // mapping between test names and Jenkins stage names. Any changes to this syntax // may break the mapping functionality. x86TestConfigs = [ "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "A10-PyTorch-1": ["a10", "l0_a10", 1, 2], "A10-PyTorch-2": ["a10", "l0_a10", 2, 2], "A10-CPP-1": ["a10", "l0_a10", 1, 1], "A10-TensorRT-1": ["a10", "l0_a10", 1, 5], "A10-TensorRT-2": ["a10", "l0_a10", 2, 5], "A10-TensorRT-3": ["a10", "l0_a10", 3, 5], "A10-TensorRT-4": ["a10", "l0_a10", 4, 5], "A10-TensorRT-5": ["a10", "l0_a10", 5, 5], "A10-Pybind": ["a10", "l0_a10_pybind", 1, 1], "A30-Triton-1": ["a30", "l0_a30", 1, 1], "A30-PyTorch-1": ["a30", "l0_a30", 1, 2], "A30-PyTorch-2": ["a30", "l0_a30", 2, 2], "A30-CPP-1": ["a30", "l0_a30", 1, 3], "A30-CPP-2": ["a30", "l0_a30", 2, 3], "A30-CPP-3": ["a30", "l0_a30", 3, 3], "A100X-PyTorch-1": ["a100x", "l0_a100", 1, 1], "L40S-PyTorch-1": ["l40s", "l0_l40s", 1, 2], "L40S-PyTorch-2": ["l40s", "l0_l40s", 2, 2], "H100_PCIe-PyTorch-1": ["h100-cr", "l0_h100", 1, 4], "H100_PCIe-PyTorch-2": ["h100-cr", "l0_h100", 2, 4], "H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 4], "H100_PCIe-PyTorch-4": ["h100-cr", "l0_h100", 4, 4], "H100_PCIe-PyTorch-Ray-1": ["h100-cr", "l0_h100", 1, 1], "H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1], "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 1], "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 3], "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 3], "B200_PCIe-PyTorch-3": ["b100-ts2", "l0_b200", 3, 3], "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1], "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2], "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2], // Currently post-merge test stages only run tests with "stage: post_merge" mako // in the test-db. This behavior may change in the future. 
"A10-PyTorch-Post-Merge-1": ["a10", "l0_a10", 1, 1], // "A10-TensorRT-Post-Merge-1": ["a10", "l0_a10", 1, 2], // "A10-TensorRT-Post-Merge-2": ["a10", "l0_a10", 2, 2], "A10-FMHA-Post-Merge-1": ["a10", "l0_a10", 1, 1], // "A30-TensorRT-Post-Merge-1": ["a30", "l0_a30", 1, 6], // "A30-TensorRT-Post-Merge-2": ["a30", "l0_a30", 2, 6], // "A30-TensorRT-Post-Merge-3": ["a30", "l0_a30", 3, 6], // "A30-TensorRT-Post-Merge-4": ["a30", "l0_a30", 4, 6], // "A30-TensorRT-Post-Merge-5": ["a30", "l0_a30", 5, 6], // "A30-TensorRT-Post-Merge-6": ["a30", "l0_a30", 6, 6], "A30-CPP-Post-Merge-1": ["a30", "l0_a30", 1, 1], "A30-Triton-Post-Merge-1": ["a30", "l0_a30", 1, 2], "A30-Triton-Post-Merge-2": ["a30", "l0_a30", 2, 2], // "A100X-TensorRT-Post-Merge-1": ["a100x", "l0_a100", 1, 6], // "A100X-TensorRT-Post-Merge-2": ["a100x", "l0_a100", 2, 6], // "A100X-TensorRT-Post-Merge-3": ["a100x", "l0_a100", 3, 6], // "A100X-TensorRT-Post-Merge-4": ["a100x", "l0_a100", 4, 6], // "A100X-TensorRT-Post-Merge-5": ["a100x", "l0_a100", 5, 6], // "A100X-TensorRT-Post-Merge-6": ["a100x", "l0_a100", 6, 6], "A100X-Triton-Post-Merge-1": ["a100x", "l0_a100", 1, 2], "A100X-Triton-Post-Merge-2": ["a100x", "l0_a100", 2, 2], "A100X-FMHA-Post-Merge-1": ["a100x", "l0_a100", 1, 1], // "L40S-TensorRT-Post-Merge-1": ["l40s", "l0_l40s", 1, 5], // "L40S-TensorRT-Post-Merge-2": ["l40s", "l0_l40s", 2, 5], // "L40S-TensorRT-Post-Merge-3": ["l40s", "l0_l40s", 3, 5], // "L40S-TensorRT-Post-Merge-4": ["l40s", "l0_l40s", 4, 5], // "L40S-TensorRT-Post-Merge-5": ["l40s", "l0_l40s", 5, 5], "L40S-FMHA-Post-Merge-1": ["l40s", "l0_l40s", 1, 1], "H100_PCIe-PyTorch-Post-Merge-1": ["h100-cr", "l0_h100", 1, 2], "H100_PCIe-PyTorch-Post-Merge-2": ["h100-cr", "l0_h100", 2, 2], "H100_PCIe-CPP-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1], // "H100_PCIe-TensorRT-Post-Merge-1": ["h100-cr", "l0_h100", 1, 5], // "H100_PCIe-TensorRT-Post-Merge-2": ["h100-cr", "l0_h100", 2, 5], // "H100_PCIe-TensorRT-Post-Merge-3": ["h100-cr", "l0_h100", 3, 5], // "H100_PCIe-TensorRT-Post-Merge-4": ["h100-cr", "l0_h100", 4, 5], // "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5], "H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1], "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2], "B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2], // "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2], // "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2], "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1], "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1], "DGX_H200-4_GPUs-Triton-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4], "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8], "DGX_H200-4_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4], // Disable RTXPro6000 stages due to nodes will be offline temporarily. // [TODO] Split tests between RTXPro6000 and RTXPro6000D and move reasonable mount of tests to pre-merge. 
// "RTXPro6000-PyTorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1], // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4], // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4], "RTXPro6000D-PyTorch-1": ["rtx-pro-6000d", "l0_rtx_pro_6000", 1, 2], "RTXPro6000D-PyTorch-2": ["rtx-pro-6000d", "l0_rtx_pro_6000", 2, 2], "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 1, 2, 4], "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4], ] parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), { def config = VANILLA_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG } if (key.contains("llvm")) { config = LLVM_CONFIG } if (key.contains("Pybind")) { config = PYBIND_CONFIG } runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3]) }]]} fullSet = parallelJobs.keySet() x86SlurmTestConfigs = [ "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 2, 2], "DGX_H100-2_GPUs-PyTorch-Others-2": ["dgx-h100-x2-oci", "l0_dgx_h100", 2, 2, 2], "DGX_H100-2_GPUs-PyTorch-GptOss-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2], "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2], "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 2, 4], "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4-oci", "l0_dgx_h100", 2, 2, 4], "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true], "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1], "DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], // PerfSanity post-merge tests // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4], // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8], // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8], // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8], ] fullSet += x86SlurmTestConfigs.keySet() parallelSlurmJobs = x86SlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), { def config = VANILLA_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG } if (key.contains("llvm")) { config = LLVM_CONFIG } 
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false) }]]} parallelJobs += parallelSlurmJobs // SBSA machines from the Blossom machine pool SBSATestConfigs = [ "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1], // DGX Spark is also named as GB10 Grace Blackwell Superchip. "GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1], ] fullSet += SBSATestConfigs.keySet() SBSASlurmTestConfigs = [ "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4], "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1], // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4], // PerfSanity pre-merge tests "GB200-4_GPUs-PyTorch-PerfSanity-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 2, 4], "GB200-4_GPUs-PyTorch-PerfSanity-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 2, 4], // PerfSanity post-merge tests "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 3, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 3, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 3, 4], ] fullSet += SBSASlurmTestConfigs.keySet() multiNodesSBSAConfigs = [ // Each testcase uses 8 GPUs and 2 nodes. // https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines. 
"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], // PerfSanity post-merge tests "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2], "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6], // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6], // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8], ] fullSet += multiNodesSBSAConfigs.keySet() if (env.targetArch == AARCH64_TRIPLE) { parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), { runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3]) }]]} // Add SBSA Slurm jobs parallelSlurmJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), { def config = LINUX_AARCH64_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG } if (key.contains("llvm")) { config = LLVM_CONFIG } runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false) }]]} parallelJobs += parallelSlurmJobs // Add SBSA multi node Slurm jobs parallelMultiNodesSBSAJobs = multiNodesSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), { def config = LINUX_AARCH64_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG } if (key.contains("llvm")) { config = LLVM_CONFIG } runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false) }]]} parallelJobs += parallelMultiNodesSBSAJobs } docBuildSpec = createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10") docBuildConfigs = [ "A10-Build_Docs": [docBuildSpec, { sh "rm -rf **/*.xml *.tar.gz" runLLMDocBuild(pipeline, config=VANILLA_CONFIG) }], ] fullSet += docBuildConfigs.keySet() if (env.targetArch == AARCH64_TRIPLE) { docBuildConfigs = [:] } docBuildJobs = docBuildConfigs.collectEntries{key, values -> [key, [values[0], { stage("[${key}] Run") { cacheErrorAndUploadResult("${key}", values[1], {}, true) } }]]} // Python version and OS for sanity check x86SanityCheckConfigs = [ "PY312-DLFW": [ LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE, "B200_PCIe", X86_64_TRIPLE, false, "dlfw/", DLFW_IMAGE, false, ], "PY310-UB2204": [ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE, "A10", 
X86_64_TRIPLE, true, "", UBUNTU_22_04_IMAGE, true, // Extra install PyTorch CUDA 13.0 package to align with the CUDA version used for building TensorRT LLM wheels. ], "PY312-UB2404": [ LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE, "RTX5090", X86_64_TRIPLE, true, "", UBUNTU_24_04_IMAGE, true, // Extra PyTorch CUDA 13.0 install ], ] aarch64SanityCheckConfigs = [ "PY312-UB2404": [ LLM_DOCKER_IMAGE, "GH200", AARCH64_TRIPLE, false, "", UBUNTU_24_04_IMAGE, true, // Extra PyTorch CUDA 13.0 install ], "PY312-DLFW": [ LLM_DOCKER_IMAGE, "GH200", AARCH64_TRIPLE, false, "dlfw/", DLFW_IMAGE, false, ], ] def toStageName = { gpuType, key -> "${gpuType}-PackageSanityCheck-${key}".toString() } fullSet += x86SanityCheckConfigs.collectEntries{ key, values -> [toStageName(values[1], key), null] }.keySet() fullSet += aarch64SanityCheckConfigs.collectEntries{ key, values -> [toStageName(values[1], key), null] }.keySet() sanityCheckConfigs = x86SanityCheckConfigs if (env.targetArch == AARCH64_TRIPLE) { sanityCheckConfigs = aarch64SanityCheckConfigs } sanityCheckJobs = sanityCheckConfigs.collectEntries {key, values -> [toStageName(values[1], key), { cacheErrorAndUploadResult(toStageName(values[1], key), { def cpu_arch = values[2] def gpu_type = values[1].toLowerCase() if (values[1] == "B200_PCIe") { gpu_type = "b100-ts2" } if (values[1] == "RTX5090") { gpu_type = "rtx-5090" } def k8s_arch = "amd64" if (cpu_arch == AARCH64_TRIPLE) { k8s_arch = "arm64" } def buildSpec = createKubernetesPodConfig(values[0], "build", k8s_arch) def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm") def sanityRunner = null def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch) sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm") def wheelPath = "${values[4]}" def wheelName = "" def cpver = "cp312" def pyver = "3.12" if (key.contains("PY310")) { cpver = "cp310" pyver = "3.10" } buildRunner("[${toStageName(values[1], key)}] Build") { def env = [] if (key.contains("manylinux")) { env = ["LD_LIBRARY_PATH+=:/usr/local/cuda/compat"] } withEnv(env) { wheelName = runLLMBuild(pipeline, cpu_arch, values[3], wheelPath, cpver) } } def fullWheelPath = "${cpu_arch}/${wheelPath}${wheelName}" // TODO: Re-enable the sanity check after updating GPU testers' driver version. // sanityRunner("Sanity check") { // runPackageSanityCheck(pipeline, fullWheelPath, values[3], cpver) // } def checkPipStage = false if (cpu_arch == X86_64_TRIPLE) { checkPipStage = true } else if (cpu_arch == AARCH64_TRIPLE) { checkPipStage = true } if (checkPipStage) { stage("Run LLMAPI Test") { pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", { echo "###### Prerequisites Start ######" echoNodeAndGpuInfo(pipeline, toStageName(values[1], key)) // Clean up the pip constraint file from the base NGC PyTorch image. 
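                        // (Rationale, inferred from the truncation step below: the DLFW base
                        // image ships /etc/pip/constraint.txt with version pins; emptying the
                        // file keeps those pins from blocking the pip installs performed in
                        // this sanity check.)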
if (values[5] == DLFW_IMAGE) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true") } trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip git rsync curl wget") trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true) trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true") trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install requests") trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 uninstall -y tensorrt") if (values[5] != DLFW_IMAGE) { def ubuntu_version = key.contains("UB2404") ? "ubuntu2404" : "ubuntu2204" def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa" trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb") trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb") trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0") } // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8) if (values[6]) { echo "###### Extra PyTorch CUDA 13.0 install Start ######" // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability. // PyTorch CUDA 13.0 package and torchvision package can be installed as expected. if (k8s_arch == "amd64") { trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") } else { trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") } } def libEnv = [] if (env.alternativeTRT) { stage("Replace TensorRT") { trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, cpver) } libEnv += ["LD_LIBRARY_PATH+tensorrt=/usr/local/tensorrt/lib"] libEnv += ["LD_LIBRARY_PATH+nvrtc=/usr/local/lib/python${pyver}/dist-packages/nvidia/cuda_nvrtc/lib"] } echo "###### Check pip install Start ######" withEnv(libEnv) { // Retry 2 times if timeout occurs. sh "env | sort" trtllm_utils.llmRetry(1, "checkPipInstall", { timeout(time: 30, unit: 'MINUTES') { checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}") } }) } echo "###### Run LLMAPI tests Start ######" def config = VANILLA_CONFIG if (cpu_arch == AARCH64_TRIPLE) { config = LINUX_AARCH64_CONFIG } withEnv(libEnv) { sh "env | sort" runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, toStageName(values[1], key), 1, 1, true, null, "-SubJob-RunTest") } }) } } }, {}, true) }]} multiGpuJobs = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && !it.key.contains("Post-Merge")} println multiGpuJobs.keySet() multiGpuJobsPostMerge = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && it.key.contains("Post-Merge")} parallelJobs += docBuildJobs parallelJobs += sanityCheckJobs postMergeJobs = parallelJobs.findAll {it.key.contains("Post-Merge")} // Start as a normal pre-merge job parallelJobsFiltered = parallelJobs - multiGpuJobs - postMergeJobs // Check if the multi GPU related file has changed or not. 
    // Check whether the multi-GPU related files have changed. If changed, add multi-GPU test stages.
    if (testFilter[(MULTI_GPU_FILE_CHANGED)]) {
        parallelJobsFiltered += multiGpuJobs
    }

    if (testFilter[(AUTO_TRIGGER_TAG_LIST)]) {
        echo "AUTO_TRIGGER_TAG_LIST mode is true. Auto trigger tags: ${testFilter[(AUTO_TRIGGER_TAG_LIST)].join(', ')}."
        def autoTriggerTagStages = [:]
        for (tag in testFilter[(AUTO_TRIGGER_TAG_LIST)]) {
            autoTriggerTagStages += parallelJobs.findAll { it.key.contains(tag) }
        }
        parallelJobsFiltered += autoTriggerTagStages
        if (autoTriggerTagStages.size() > 0) {
            echo "Auto trigger will force run stages: ${autoTriggerTagStages.keySet().join(', ')}."
        }
        println parallelJobsFiltered.keySet()
    }

    // Check --post-merge, post-merge pipelines, or TRT dependency testing pipelines.
    // If true, add post-merge-only test stages and multi-GPU test stages.
    if (env.alternativeTRT || testFilter[(IS_POST_MERGE)]) {
        parallelJobsFiltered += multiGpuJobs
        parallelJobsFiltered += postMergeJobs
    }

    // Check --skip-test; if set, only run doc build and sanity check stages.
    if (testFilter[(ENABLE_SKIP_TEST)]) {
        echo "All test stages are skipped."
        parallelJobsFiltered = docBuildJobs + sanityCheckJobs
    }

    // Check --add-multi-gpu-test; if true, add multi-GPU test stages back.
    if (testFilter[(ADD_MULTI_GPU_TEST)]) {
        parallelJobsFiltered += multiGpuJobs
    }

    // Check --only-multi-gpu-test; if true, only run multi-GPU test stages.
    if (testFilter[(ONLY_MULTI_GPU_TEST)]) {
        if (testFilter[(IS_POST_MERGE)]) {
            parallelJobsFiltered = multiGpuJobsPostMerge
        } else {
            parallelJobsFiltered = multiGpuJobs
        }
    }

    // Check --disable-multi-gpu-test; if true, remove multi-GPU test stages.
    if (testFilter[(DISABLE_MULTI_GPU_TEST)]) {
        parallelJobsFiltered -= multiGpuJobs
    }

    // Check --gpu-type; filter test stages by GPU type.
    if (testFilter[(GPU_TYPE_LIST)] != null) {
        echo "Use GPU_TYPE_LIST for filtering. GPU types: ${testFilter[(GPU_TYPE_LIST)]}."
        parallelJobsFiltered = parallelJobsFiltered.findAll {it.key.tokenize('-')[0] in testFilter[(GPU_TYPE_LIST)]}
        println parallelJobsFiltered.keySet()
    }

    // Check --backend-mode; filter test stages by backend.
    if (testFilter[(TEST_BACKEND)] != null) {
        echo "Use TEST_BACKEND for filtering. Backend mode: ${testFilter[(TEST_BACKEND)]}."
        def backendMode = testFilter[(TEST_BACKEND)].collect { it.toLowerCase() }
        def changeMap = [
            "pytorch": "-PyTorch-",
            "tensorrt": "-TensorRT-",
            "cpp": "-CPP-",
            "triton": "-Triton-",
            "fmha": "-FMHA-",
        ]
        def backendModeList = backendMode.collect { changeMap.get(it) }.flatten()
        def parallelJobsNoBackend = parallelJobsFiltered.findAll { key, _ -> !changeMap.values().any { backend -> key.contains(backend) } }
        def parallelJobsBackendMode = parallelJobsFiltered.findAll { key, _ -> backendModeList.any { backend -> key.contains(backend) } }
        parallelJobsFiltered = parallelJobsNoBackend + parallelJobsBackendMode
        echo "parallelJobsBackendMode: ${parallelJobsBackendMode.keySet()}"
        println parallelJobsFiltered.keySet()
    }

    if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
        echo "Only docs files are changed, run doc build stage only."
        parallelJobsFiltered = docBuildJobs
        println parallelJobsFiltered.keySet()
    } else if (testFilter[(ONLY_ONE_GROUP_CHANGED)] != "") {
        if (testFilter[(TEST_BACKEND)] != null) {
            echo "Force disable ONLY_ONE_GROUP_CHANGED mode. Backend mode set by flag: ${testFilter[(TEST_BACKEND)]}."
        } else {
            echo "ONLY_ONE_GROUP_CHANGED mode is true. The group is: ${testFilter[(ONLY_ONE_GROUP_CHANGED)]}."
            def excludedBackends = new HashMap()
            excludedBackends["PyTorch"] = ["-CPP-", "-TensorRT-", "-FMHA-"] // A PyTorch-only file change still needs to run the Triton tests.
            excludedBackends["Triton"] = ["-PyTorch-", "-CPP-", "-TensorRT-", "-FMHA-"]
            excludedBackends["FMHA"] = ["-PyTorch-", "-CPP-", "-TensorRT-", "-Triton-"]
            def group = testFilter[(ONLY_ONE_GROUP_CHANGED)]
            if (excludedBackends.containsKey(group)) {
                parallelJobsFiltered = parallelJobsFiltered.findAll { key, value ->
                    !excludedBackends[group].any { backend -> key.contains(backend) }
                }
            }
            println parallelJobsFiltered.keySet()
        }
    }

    // Check --stage-list; only run the stages in the stage list.
    if (testFilter[TEST_STAGE_LIST] != null) {
        echo "Use TEST_STAGE_LIST for filtering. Stages: ${testFilter[(TEST_STAGE_LIST)]}."
        parallelJobsFiltered = parallelJobs.findAll {it.key in testFilter[(TEST_STAGE_LIST)]}
        println parallelJobsFiltered.keySet()
    }

    // Check --extra-stage; add the stages in the extra-stage list.
    if (testFilter[EXTRA_STAGE_LIST] != null) {
        echo "Use EXTRA_STAGE_LIST for filtering. Stages: ${testFilter[(EXTRA_STAGE_LIST)]}."
        parallelJobsFiltered += parallelJobs.findAll {it.key in testFilter[(EXTRA_STAGE_LIST)]}
        println parallelJobsFiltered.keySet()
    }

    checkStageName(fullSet)
    if (testFilter[(TEST_STAGE_LIST)] != null) {
        checkStageNameSet(testFilter[(TEST_STAGE_LIST)], fullSet, TEST_STAGE_LIST)
    }
    if (testFilter[(EXTRA_STAGE_LIST)] != null) {
        checkStageNameSet(testFilter[(EXTRA_STAGE_LIST)], fullSet, EXTRA_STAGE_LIST)
    }
    echo "Check the passed GitLab bot testFilter parameters."

    def keysStr = parallelJobsFiltered.keySet().join(",\n")
    pipeline.echo "Now we will run stages: [\n${keysStr}\n]"

    parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, {
        stage(key) {
            if (key in testFilter[REUSE_STAGE_LIST]) {
                stage("Skip - Reused") {
                    echo "Skip - Passed in the previous pipelines."
                }
            } else if (values instanceof List) {
                trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
                    values[1]()
                })
            } else {
                values()
            }
        }
    }]}

    return parallelJobsFiltered
}

def launchTestJobsForImagesSanityCheck(pipeline, globalVars) {
    def testConfigs = [
        "NGC Devel Image amd64": [
            name: "NGC-Devel-Image-amd64-Sanity-Test",
            k8sArch: "amd64",
            wheelInstalled: false,
            config: VANILLA_CONFIG,
        ],
        "NGC Devel Image arm64": [
            name: "NGC-Devel-Image-arm64-Sanity-Test",
            k8sArch: "arm64",
            wheelInstalled: false,
            config: LINUX_AARCH64_CONFIG,
        ],
        "NGC Release Image amd64": [
            name: "NGC-Release-Image-amd64-Sanity-Test-A10",
            gpuType: "a10",
            k8sArch: "amd64",
            wheelInstalled: true,
            config: VANILLA_CONFIG,
        ],
        "NGC Release Image arm64": [
            name: "NGC-Release-Image-arm64-Sanity-Test-GH200",
            gpuType: "gh200",
            k8sArch: "arm64",
            wheelInstalled: true,
            config: LINUX_AARCH64_CONFIG,
        ],
    ]
    if (!ENABLE_NGC_DEVEL_IMAGE_TEST) {
        ["NGC Devel Image amd64", "NGC Devel Image arm64"].each { key -> testConfigs.remove(key) }
        echo "NGC Devel Image test is disabled."
    }
    if (!ENABLE_NGC_RELEASE_IMAGE_TEST) {
        ["NGC Release Image amd64", "NGC Release Image arm64"].each { key -> testConfigs.remove(key) }
        echo "NGC Release Image test is disabled."
    }
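    // globalVars[IMAGE_KEY_TO_TAG] is expected to map the config keys above to concrete image tags,
    // e.g. (hypothetical) "NGC Devel Image amd64" -> "urm.nvidia.com/.../tensorrt-llm:<tag>"; any
    // config whose key is missing from that map keeps image == null and is filtered out below.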
    // Update the testConfigs image field using the map from globalVars.
    testConfigs.each { key, config ->
        if (globalVars[IMAGE_KEY_TO_TAG] && globalVars[IMAGE_KEY_TO_TAG][key]) {
            config.image = globalVars[IMAGE_KEY_TO_TAG][key]
        }
    }

    // Filter out all configs that don't have an image set.
    testConfigs = testConfigs.findAll { key, config ->
        return config.image != null
    }
    echo "Filtered test configs with images:"
    println testConfigs

    def testJobs = testConfigs.collectEntries { key, values -> [values.name, {
        if (values.wheelInstalled) {
            stage(values.name) {
                echo "Run ${values.name} sanity test."
                imageSanitySpec = createKubernetesPodConfig(values.image, values.gpuType, values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y git rsync curl")
                    runLLMTestlistOnPlatform(pipeline, values.gpuType, "l0_sanity_check", values.config, false, values.name, 1, 1, true, null, "-SubJob-TestImage")
                })
            }
        } else {
            stage(values.name) {
                imageSanitySpec = createKubernetesPodConfig(values.image, "build", values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    def cpuArch = values.k8sArch == "amd64" ? X86_64_TRIPLE : AARCH64_TRIPLE
                    runLLMBuild(pipeline, cpuArch, false, "imageTest/")
                })
            }
        }
    }]}

    return testJobs
}

pipeline {
    agent {
        kubernetes createKubernetesPodConfig("", "agent")
    }
    options {
        // Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
        // Some steps, such as the results analysis stage, do not need to check out the source code.
        skipDefaultCheckout()
        // Use timestamps to better analyze the time spent in each step/test.
        timestamps()
        timeout(time: 24, unit: 'HOURS')
    }
    environment {
        // The workspace is normally: /home/jenkins/agent/workspace/LLM/L0_MergeRequest@tmp/
        HF_HOME="${env.WORKSPACE_TMP}/.cache/huggingface"
        CCACHE_DIR="${CCACHE_DIR}"
        GITHUB_MIRROR="https://urm.nvidia.com/artifactory/github-go-remote"
        PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
        // Force datasets into offline mode to prevent CI jobs from downloading HF datasets and causing test failures.
        HF_DATASETS_OFFLINE=1
        CMAKE_POLICY_VERSION_MINIMUM="3.5"
        OPEN_SEARCH_DB_BASE_URL=credentials("open_search_db_base_url")
        OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
    }
    stages {
        stage("Setup Environment") {
            steps {
                script {
                    echo "enableFailFast is: ${params.enableFailFast}"
                    echo "env.testFilter is: ${env.testFilter}"
                    testFilter = trtllm_utils.updateMapWithJson(this, testFilter, env.testFilter, "testFilter")
                    println testFilter
                    echo "env.globalVars is: ${env.globalVars}"
                    globalVars = trtllm_utils.updateMapWithJson(this, globalVars, env.globalVars, "globalVars")
                    globalVars[ACTION_INFO] = trtllm_utils.setupPipelineDescription(this, globalVars[ACTION_INFO])
                }
            }
        }
        stage("Check Test List") {
            when {
                expression {
                    // Only run the test list validation when necessary.
                    env.targetArch == X86_64_TRIPLE &&
                    testFilter[ONLY_ONE_GROUP_CHANGED] != "Docs" &&
                    !(env.JOB_NAME ==~ /.*Multi-GPU.*/) &&
                    !(env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/)
                }
            }
            steps {
                script {
                    launchTestListCheck(this)
                }
            }
        }
        stage("Test") {
            steps {
                script {
                    if (env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/) {
                        parallelJobs = launchTestJobsForImagesSanityCheck(this, globalVars)
                    } else {
                        parallelJobs = launchTestJobs(this, testFilter)
                    }

                    singleGpuJobs = parallelJobs
                    dgxJobs = [:]

                    def testPhase2StageName = env.testPhase2StageName
                    if (testPhase2StageName) {
                        // Stage names embed a GPU-count marker (e.g. "4_GPUs"); the phase-2 split below keys off it.
                        def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
                        singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
                        dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
                    }

                    if (env.JOB_NAME ==~ /.*Single-GPU.*/) {
                        echo "Only run single-GPU tests."
                        if (dgxJobs.size() > 0) {
                            if (globalVars[ACTION_INFO]['parents'].size() > 0) {
                                // We add a special marker to the parent job's description.
                                // This will be used to decide whether to run the multi-GPU test stage.
                                def parentJob = globalVars[ACTION_INFO]['parents'][-2]
                                def archStr = (env.targetArch == X86_64_TRIPLE) ? "x86_64" : (env.targetArch == AARCH64_TRIPLE ? "SBSA" : "Unknown")
                                trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require ${archStr} Multi-GPU Testing====")
                            } else {
                                echo "No parent job found to add the special marker for executing multi-GPU test stage."
                            }
                        } else {
                            echo "Skip multi-GPU testing. No test to run."
                        }
                        if (singleGpuJobs.size() > 0) {
                            singleGpuJobs.failFast = params.enableFailFast
                            parallel singleGpuJobs
                        } else {
                            echo "Skip single-GPU testing. No test to run."
                        }
                    } else if (env.JOB_NAME ==~ /.*Multi-GPU.*/) {
                        echo "Only run multi-GPU tests."
                        if (dgxJobs.size() > 0) {
                            dgxJobs.failFast = params.enableFailFast
                            parallel dgxJobs
                        } else {
                            error "Skip multi-GPU testing. No test to run."
                        }
                    } else {
                        if (singleGpuJobs.size() > 0) {
                            singleGpuJobs.failFast = params.enableFailFast
                            parallel singleGpuJobs
                        } else {
                            echo "Skip single-GPU testing. No test to run."
                        }
                        if (dgxJobs.size() > 0) {
                            stage(testPhase2StageName) {
                                dgxJobs.failFast = params.enableFailFast
                                parallel dgxJobs
                            }
                        }
                    }
                }
            }
        } // Test stage
    } // stages
} // pipeline