infra: [TRTLLM-5250] Add sanity check stage for ngc-release images (Build wheels for devel image) (#4656)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
Zhanrui Sun 2025-07-21 16:06:43 +08:00 committed by GitHub
parent 3efad2e58c
commit 3cbc23f783
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 227 additions and 12 deletions

View File

@ -12,6 +12,7 @@ withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LL
LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
LLM_ROOT = "llm"
@ -25,6 +26,8 @@ LLM_SHORT_COMMIT = env.gitlabCommit ? env.gitlabCommit.substring(0, 7) : "undefi
LLM_DEFAULT_TAG = env.defaultTag ?: "${LLM_SHORT_COMMIT}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
RUN_SANITY_CHECK = params.runSanityCheck ?: false
BUILD_JOBS = "32"
BUILD_JOBS_RELEASE_X86_64 = "32"
BUILD_JOBS_RELEASE_SBSA = "32"
@ -37,10 +40,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
@Field
def ACTION_INFO = "action_info"
@Field
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
// Shared state map keyed by the string constants declared above. Every entry starts
// empty here; the "Sanity Check for NGC Images" stage later stores imageKeyToTag under
// IMAGE_KEY_TO_TAG and serializes the whole map to JSON for the downstream job.
def globalVars = [
(GITHUB_PR_API_URL): null,
(CACHED_CHANGED_FILE_LIST): null,
(ACTION_INFO): null,
(IMAGE_KEY_TO_TAG): [:],
]
@Field
@ -203,15 +209,11 @@ def buildImage(config, imageKeyToTag)
def dependentImageWithTag = "${IMAGE_NAME}/${dependent.dockerfileStage}:${dependentTag}"
def customImageWithTag = "${IMAGE_NAME}/${dockerfileStage}:${customTag}"
if (target == "ngc-release") {
if (params.triggerType == "post-merge") {
echo "Use NGC artifacts for post merge build"
dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
}
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
if (target == "ngc-release" && params.triggerType == "post-merge") {
echo "Use NGC artifacts for post merge build"
dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
}
args += " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
@ -266,6 +268,9 @@ def buildImage(config, imageKeyToTag)
"""
}
args += " DEVEL_IMAGE=${dependentImageWithTag}"
if (target == "ngc-release") {
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
}
}
}
@ -290,6 +295,9 @@ def buildImage(config, imageKeyToTag)
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
"""
}
if (target == "ngc-release") {
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
}
}
if (customTag) {
@ -429,6 +437,17 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
}
/**
 * Collects the parameters that every downstream job trigger shares:
 * the source repo URL, the commit, and the artifact/upload paths.
 *
 * @return map of parameter name -> value
 */
def getCommonParameters()
{
    def commonParams = [:]
    commonParams['gitlabSourceRepoHttpUrl'] = LLM_REPO
    commonParams['gitlabCommit'] = env.gitlabCommit
    commonParams['artifactPath'] = ARTIFACT_PATH
    commonParams['uploadPath'] = UPLOAD_PATH
    return commonParams
}
pipeline {
agent {
kubernetes createKubernetesPodConfig("agent")
@ -494,7 +513,100 @@ pipeline {
}
}
}
stage("Register Images for Security Checks") {
// Blocks the pipeline until both wheel tarballs exist under UPLOAD_PATH on Artifactory,
// or fails the stage after maxWaitMinutes. Only runs when RUN_SANITY_CHECK is truthy.
stage("Wait for Build Jobs Complete") {
when {
expression {
RUN_SANITY_CHECK
}
}
steps {
script {
container("python3") {
// Install wget
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")
// Poll for build artifacts
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
// Tarballs that must all be present before the sanity check may start.
def requiredFiles = [
"TensorRT-LLM-GH200.tar.gz",
"TensorRT-LLM.tar.gz"
]
def maxWaitMinutes = 60
def pollIntervalSeconds = 60
echo "Waiting for build artifacts..."
echo "Required files: ${requiredFiles}"
def startTime = System.currentTimeMillis()
def maxWaitMs = maxWaitMinutes * 60 * 1000
// Each iteration probes every required file, then sleeps; the deadline is only
// re-checked at the top of the loop, so the last sleep may run past maxWaitMs.
while ((System.currentTimeMillis() - startTime) < maxWaitMs) {
def missingFiles = []
for (file in requiredFiles) {
def fileUrl = "${artifactBaseUrl}${file}"
// `wget --spider` only checks that the URL exists (nothing is downloaded);
// returnStatus makes a non-zero exit a value instead of a step failure.
def exitCode = sh(
script: "wget --spider --quiet --timeout=30 --tries=1 '${fileUrl}'",
returnStatus: true
)
if (exitCode != 0) {
missingFiles.add(file)
}
}
if (missingFiles.isEmpty()) {
echo "All build artifacts are ready!"
// Leaves the enclosing container closure, so the timeout error below is skipped.
return
}
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
echo "Waiting... (${elapsedMinutes.intValue()} minutes elapsed)"
echo "Missing files: ${missingFiles}"
sleep(pollIntervalSeconds)
}
// Reached only when the loop ran out of time without seeing every file.
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
error "Timeout waiting for build artifacts (${elapsedMinutes.intValue()} minutes)"
}
}
}
}
// Triggers the BuildDockerImageSanityTest helper job against the images recorded in
// imageKeyToTag and fails this stage unless that job finishes with SUCCESS.
stage("Sanity Check for NGC Images") {
    when {
        expression {
            RUN_SANITY_CHECK
        }
    }
    steps {
        script {
            // Hand the image map to the downstream job through the serialized globalVars.
            globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
            String globalVarsJson = writeJSON(returnText: true, json: globalVars)
            def parameters = getCommonParameters() + [
                'enableFailFast': false,
                'globalVars': globalVarsJson,
            ]
            echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"
            // propagate: false keeps a downstream failure from aborting here, so the
            // result can be inspected explicitly below.
            def downstream = build(
                job: "/LLM/helpers/BuildDockerImageSanityTest",
                parameters: trtllm_utils.toBuildParameters(parameters),
                propagate: false,
            )
            echo "Triggered job: ${downstream.absoluteUrl}"
            if (downstream.result != "SUCCESS") {
                error "Downstream job did not succeed"
            }
        }
    }
}
stage("Register NGC Images for Security Checks") {
when {
expression {
return params.nspect_id && params.action == "push"

View File

@ -142,10 +142,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
@Field
def ACTION_INFO = "action_info"
@Field
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
// Shared state map keyed by the string constants declared above. The PR API URL and
// action info are seeded from gitlabParamsFromBot (null when absent); the changed-file
// cache starts as null and the image-key -> tag map starts empty.
def globalVars = [
(GITHUB_PR_API_URL): gitlabParamsFromBot.get('github_pr_api_url', null),
(CACHED_CHANGED_FILE_LIST): null,
(ACTION_INFO): gitlabParamsFromBot.get('action_info', null),
(IMAGE_KEY_TO_TAG): [:],
]
// If not running all test stages in the L0 pre-merge, we will not update the GitLab status at the end.
@ -1091,6 +1094,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
'branch': branch,
'action': "push",
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
'runSanityCheck': true,
]
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)

View File

@ -95,6 +95,10 @@ TESTER_MEMORY = "96Gi"
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
// Switches for the NGC image sanity tests. Each one falls back to false when the
// matching job parameter is unset or falsy; launchTestJobsForImagesSanityCheck drops
// the corresponding test configs when a switch is off.
// ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def remote = [
@ -474,10 +478,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
@Field
def ACTION_INFO = "action_info"
@Field
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
// Shared state map keyed by the string constants declared above; every entry starts
// empty. launchTestJobsForImagesSanityCheck reads the image-key -> image map stored
// under IMAGE_KEY_TO_TAG to decide which image each sanity test runs in.
def globalVars = [
(GITHUB_PR_API_URL): null,
(CACHED_CHANGED_FILE_LIST): null,
(ACTION_INFO): null,
(IMAGE_KEY_TO_TAG): [:],
]
String getShortenedJobName(String path)
@ -490,6 +497,7 @@ String getShortenedJobName(String path)
"L1_Custom": "l1-cus",
"L1_Nightly": "l1-nt",
"L1_Stable": "l1-stb",
"BuildDockerImageSanityTest": "img-check",
]
def parts = path.split('/')
// Apply nameMapping to the last part (jobName)
@ -2264,6 +2272,90 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
return parallelJobsFiltered
}
/**
 * Builds the sanity-test jobs for the NGC devel and release images.
 *
 * A test config is kept only when its family is enabled
 * (ENABLE_NGC_DEVEL_IMAGE_TEST / ENABLE_NGC_RELEASE_IMAGE_TEST) and
 * globalVars[IMAGE_KEY_TO_TAG] holds an image for its key.
 *
 * @param pipeline   pipeline object forwarded to the trtllm_utils and run* helpers
 * @param globalVars map that may carry an image-key -> image map under IMAGE_KEY_TO_TAG
 * @return map of stage name -> closure that runs that image's sanity test
 */
def launchTestJobsForImagesSanityCheck(pipeline, globalVars) {
    def testConfigs = [
        "NGC Devel Image amd64": [
            name: "NGC-Devel-Image-amd64-Sanity-Test",
            k8sArch: "amd64",
            wheelInstalled: false,
            config: VANILLA_CONFIG,
        ],
        "NGC Devel Image arm64": [
            name: "NGC-Devel-Image-arm64-Sanity-Test",
            k8sArch: "arm64",
            wheelInstalled: false,
            config: LINUX_AARCH64_CONFIG,
        ],
        "NGC Release Image amd64": [
            name: "NGC-Release-Image-amd64-Sanity-Test-A10",
            gpuType: "a10",
            k8sArch: "amd64",
            wheelInstalled: true,
            config: VANILLA_CONFIG,
        ],
        "NGC Release Image arm64": [
            name: "NGC-Release-Image-arm64-Sanity-Test-GH200",
            gpuType: "gh200",
            k8sArch: "arm64",
            wheelInstalled: true,
            config: LINUX_AARCH64_CONFIG,
        ],
    ]

    if (!ENABLE_NGC_DEVEL_IMAGE_TEST) {
        ["NGC Devel Image amd64", "NGC Devel Image arm64"].each { key ->
            testConfigs.remove(key)
        }
        echo "NGC Devel Image test is disabled."
    }
    if (!ENABLE_NGC_RELEASE_IMAGE_TEST) {
        ["NGC Release Image amd64", "NGC Release Image arm64"].each { key ->
            testConfigs.remove(key)
        }
        echo "NGC Release Image test is disabled."
    }

    // Update testConfigs image field using the map from globalVars
    testConfigs.each { key, config ->
        if (globalVars[IMAGE_KEY_TO_TAG] && globalVars[IMAGE_KEY_TO_TAG][key]) {
            config.image = globalVars[IMAGE_KEY_TO_TAG][key]
        }
    }
    // Filter out all configs that don't have image set
    testConfigs = testConfigs.findAll { key, config ->
        return config.image != null
    }

    echo "Filtered test configs with images:"
    println testConfigs

    def testJobs = testConfigs.collectEntries { key, values -> [values.name, {
        if (values.wheelInstalled) {
            // Release image: the wheel is already installed, so run the sanity test list on a GPU pod.
            stage(values.name) {
                echo "Run ${values.name} sanity test."
                // Declared with `def` so each job closure owns its pod spec; an undeclared
                // name would live in the script binding and be shared by all job closures.
                def imageSanitySpec = createKubernetesPodConfig(values.image, values.gpuType, values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y git rsync curl")
                    runLLMTestlistOnPlatform(pipeline, values.gpuType, "l0_sanity_check", values.config, false, values.name , 1, 1, true, null)
                })
            }
        } else {
            // Devel image: no wheel installed, so the check is that a build succeeds inside the image.
            stage(values.name) {
                def imageSanitySpec = createKubernetesPodConfig(values.image, "build", values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    def cpuArch = values.k8sArch == "amd64" ? X86_64_TRIPLE : AARCH64_TRIPLE
                    runLLMBuild(pipeline, cpuArch, false, "imageTest/")
                })
            }
        }
    }]}

    return testJobs
}
pipeline {
agent {
kubernetes createKubernetesPodConfig("", "agent")
@ -2306,7 +2398,10 @@ pipeline {
when {
expression {
// Only run the test list validation when necessary
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
env.targetArch == X86_64_TRIPLE &&
testFilter[ONLY_DOCS_FILE_CHANGED] == false &&
!(env.JOB_NAME ==~ /.*Multi-GPU.*/) &&
!(env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/)
}
}
steps
@ -2319,7 +2414,11 @@ pipeline {
stage("Test") {
steps {
script {
parallelJobs = launchTestJobs(this, testFilter)
if (env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/) {
parallelJobs = launchTestJobsForImagesSanityCheck(this, globalVars)
} else {
parallelJobs = launchTestJobs(this, testFilter)
}
singleGpuJobs = parallelJobs
dgxJobs = [:]