mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
infra: [TRTLLM-5250] Add sanity check stage for ngc-release images (Build wheels for devel image) (#4656)
Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
3efad2e58c
commit
3cbc23f783
@ -12,6 +12,7 @@ withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LL
|
||||
LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
|
||||
}
|
||||
|
||||
ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
|
||||
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
|
||||
|
||||
LLM_ROOT = "llm"
|
||||
@ -25,6 +26,8 @@ LLM_SHORT_COMMIT = env.gitlabCommit ? env.gitlabCommit.substring(0, 7) : "undefi
|
||||
|
||||
LLM_DEFAULT_TAG = env.defaultTag ?: "${LLM_SHORT_COMMIT}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
|
||||
|
||||
RUN_SANITY_CHECK = params.runSanityCheck ?: false
|
||||
|
||||
BUILD_JOBS = "32"
|
||||
BUILD_JOBS_RELEASE_X86_64 = "32"
|
||||
BUILD_JOBS_RELEASE_SBSA = "32"
|
||||
@ -37,10 +40,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
|
||||
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
|
||||
@Field
|
||||
def ACTION_INFO = "action_info"
|
||||
@Field
|
||||
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
|
||||
def globalVars = [
|
||||
(GITHUB_PR_API_URL): null,
|
||||
(CACHED_CHANGED_FILE_LIST): null,
|
||||
(ACTION_INFO): null,
|
||||
(IMAGE_KEY_TO_TAG): [:],
|
||||
]
|
||||
|
||||
@Field
|
||||
@ -203,15 +209,11 @@ def buildImage(config, imageKeyToTag)
|
||||
def dependentImageWithTag = "${IMAGE_NAME}/${dependent.dockerfileStage}:${dependentTag}"
|
||||
def customImageWithTag = "${IMAGE_NAME}/${dockerfileStage}:${customTag}"
|
||||
|
||||
if (target == "ngc-release") {
|
||||
if (params.triggerType == "post-merge") {
|
||||
echo "Use NGC artifacts for post merge build"
|
||||
dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
|
||||
imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
|
||||
customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
|
||||
}
|
||||
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
|
||||
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
|
||||
if (target == "ngc-release" && params.triggerType == "post-merge") {
|
||||
echo "Use NGC artifacts for post merge build"
|
||||
dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
|
||||
imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
|
||||
customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
|
||||
}
|
||||
|
||||
args += " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
|
||||
@ -266,6 +268,9 @@ def buildImage(config, imageKeyToTag)
|
||||
"""
|
||||
}
|
||||
args += " DEVEL_IMAGE=${dependentImageWithTag}"
|
||||
if (target == "ngc-release") {
|
||||
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -290,6 +295,9 @@ def buildImage(config, imageKeyToTag)
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
|
||||
"""
|
||||
}
|
||||
if (target == "ngc-release") {
|
||||
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
|
||||
}
|
||||
}
|
||||
|
||||
if (customTag) {
|
||||
@ -429,6 +437,17 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Build the base parameter map handed to a downstream job triggered by this pipeline.
 *
 * @return a new map holding the source repo URL (LLM_REPO), the commit from
 *         env.gitlabCommit, and the artifact / upload paths of this build
 */
def getCommonParameters()
{
    def commonParameters = [:]
    commonParameters['gitlabSourceRepoHttpUrl'] = LLM_REPO
    commonParameters['gitlabCommit'] = env.gitlabCommit
    commonParameters['artifactPath'] = ARTIFACT_PATH
    commonParameters['uploadPath'] = UPLOAD_PATH
    return commonParameters
}
|
||||
|
||||
|
||||
pipeline {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("agent")
|
||||
@ -494,7 +513,100 @@ pipeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Register Images for Security Checks") {
|
||||
stage("Wait for Build Jobs Complete") {
    // Only runs when the sanity check was requested (RUN_SANITY_CHECK).
    when {
        expression {
            RUN_SANITY_CHECK
        }
    }
    steps {
        script {
            container("python3") {
                // wget is needed below to probe the artifact URLs.
                trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")

                // Polling configuration: which files must exist, how long to wait, how often to look.
                def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
                def requiredFiles = ["TensorRT-LLM-GH200.tar.gz", "TensorRT-LLM.tar.gz"]
                def maxWaitMinutes = 60
                def pollIntervalSeconds = 60

                echo "Waiting for build artifacts..."
                echo "Required files: ${requiredFiles}"

                def startTime = System.currentTimeMillis()
                def deadline = startTime + maxWaitMinutes * 60 * 1000

                while (System.currentTimeMillis() < deadline) {
                    // Probe every required file; a non-zero wget status counts as "not there yet".
                    def missingFiles = []
                    for (fileName in requiredFiles) {
                        def probeStatus = sh(
                            returnStatus: true,
                            script: "wget --spider --quiet --timeout=30 --tries=1 '${artifactBaseUrl}${fileName}'"
                        )
                        if (probeStatus != 0) {
                            missingFiles << fileName
                        }
                    }

                    if (!missingFiles) {
                        echo "All build artifacts are ready!"
                        // Leaves the container closure, which ends this stage successfully.
                        return
                    }

                    def waitedMinutes = ((System.currentTimeMillis() - startTime) / (60 * 1000)).intValue()
                    echo "Waiting... (${waitedMinutes} minutes elapsed)"
                    echo "Missing files: ${missingFiles}"
                    sleep(pollIntervalSeconds)
                }

                // The deadline passed without all files showing up: fail the stage.
                def totalMinutes = ((System.currentTimeMillis() - startTime) / (60 * 1000)).intValue()
                error "Timeout waiting for build artifacts (${totalMinutes} minutes)"
            }
        }
    }
}
|
||||
stage("Sanity Check for NGC Images") {
    // Only runs when the sanity check was requested (RUN_SANITY_CHECK).
    when {
        expression {
            RUN_SANITY_CHECK
        }
    }
    steps {
        script {
            // Pass the image-key -> image-tag map to the downstream job via the JSON-encoded globalVars.
            globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
            String globalVarsJson = writeJSON returnText: true, json: globalVars

            def parameters = getCommonParameters() + [
                'enableFailFast': false,
                'globalVars': globalVarsJson,
            ]
            echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"

            // propagate: false so the downstream result is inspected here instead of failing the step directly.
            def handle = build(
                job: "/LLM/helpers/BuildDockerImageSanityTest",
                parameters: trtllm_utils.toBuildParameters(parameters),
                propagate: false
            )
            echo "Triggered job: ${handle.absoluteUrl}"

            if (handle.result != "SUCCESS") {
                error "Downstream job did not succeed"
            }
        }
    }
}
|
||||
stage("Register NGC Images for Security Checks") {
|
||||
when {
|
||||
expression {
|
||||
return params.nspect_id && params.action == "push"
|
||||
|
||||
@ -142,10 +142,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
|
||||
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
|
||||
@Field
|
||||
def ACTION_INFO = "action_info"
|
||||
@Field
|
||||
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
|
||||
def globalVars = [
|
||||
(GITHUB_PR_API_URL): gitlabParamsFromBot.get('github_pr_api_url', null),
|
||||
(CACHED_CHANGED_FILE_LIST): null,
|
||||
(ACTION_INFO): gitlabParamsFromBot.get('action_info', null),
|
||||
(IMAGE_KEY_TO_TAG): [:],
|
||||
]
|
||||
|
||||
// If not running all test stages in the L0 pre-merge, we will not update the GitLab status at the end.
|
||||
@ -1091,6 +1094,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
'branch': branch,
|
||||
'action': "push",
|
||||
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
|
||||
'runSanityCheck': true,
|
||||
]
|
||||
|
||||
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
|
||||
|
||||
@ -95,6 +95,10 @@ TESTER_MEMORY = "96Gi"
|
||||
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
|
||||
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
|
||||
|
||||
// ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
|
||||
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
|
||||
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
|
||||
|
||||
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
|
||||
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
|
||||
def remote = [
|
||||
@ -474,10 +478,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
|
||||
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
|
||||
@Field
|
||||
def ACTION_INFO = "action_info"
|
||||
@Field
|
||||
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
|
||||
def globalVars = [
|
||||
(GITHUB_PR_API_URL): null,
|
||||
(CACHED_CHANGED_FILE_LIST): null,
|
||||
(ACTION_INFO): null,
|
||||
(IMAGE_KEY_TO_TAG): [:],
|
||||
]
|
||||
|
||||
String getShortenedJobName(String path)
|
||||
@ -490,6 +497,7 @@ String getShortenedJobName(String path)
|
||||
"L1_Custom": "l1-cus",
|
||||
"L1_Nightly": "l1-nt",
|
||||
"L1_Stable": "l1-stb",
|
||||
"BuildDockerImageSanityTest": "img-check",
|
||||
]
|
||||
def parts = path.split('/')
|
||||
// Apply nameMapping to the last part (jobName)
|
||||
@ -2264,6 +2272,90 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
|
||||
return parallelJobsFiltered
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Build the map of sanity-test jobs for the NGC devel / release images.
 *
 * Each candidate config is keyed by the same string used in
 * globalVars[IMAGE_KEY_TO_TAG]; a config only produces a job when its image
 * family is enabled (ENABLE_NGC_DEVEL_IMAGE_TEST / ENABLE_NGC_RELEASE_IMAGE_TEST)
 * and an image was supplied for its key.
 *
 * @param pipeline   pipeline object forwarded to the pod-launch, build and test helpers
 * @param globalVars map that may carry an IMAGE_KEY_TO_TAG entry (image key -> image)
 * @return map of stage name -> closure; each closure runs one sanity-test stage
 */
def launchTestJobsForImagesSanityCheck(pipeline, globalVars) {
    def testConfigs = [
        "NGC Devel Image amd64": [
            name: "NGC-Devel-Image-amd64-Sanity-Test",
            k8sArch: "amd64",
            wheelInstalled: false,
            config: VANILLA_CONFIG,
        ],
        "NGC Devel Image arm64": [
            name: "NGC-Devel-Image-arm64-Sanity-Test",
            k8sArch: "arm64",
            wheelInstalled: false,
            config: LINUX_AARCH64_CONFIG,
        ],
        "NGC Release Image amd64": [
            name: "NGC-Release-Image-amd64-Sanity-Test-A10",
            gpuType: "a10",
            k8sArch: "amd64",
            wheelInstalled: true,
            config: VANILLA_CONFIG,
        ],
        "NGC Release Image arm64": [
            name: "NGC-Release-Image-arm64-Sanity-Test-GH200",
            gpuType: "gh200",
            k8sArch: "arm64",
            wheelInstalled: true,
            config: LINUX_AARCH64_CONFIG,
        ],
    ]

    // Drop the image families that are switched off through job parameters.
    if (!ENABLE_NGC_DEVEL_IMAGE_TEST) {
        ["NGC Devel Image amd64", "NGC Devel Image arm64"].each { key ->
            testConfigs.remove(key)
        }
        echo "NGC Devel Image test is disabled."
    }
    if (!ENABLE_NGC_RELEASE_IMAGE_TEST) {
        ["NGC Release Image amd64", "NGC Release Image arm64"].each { key ->
            testConfigs.remove(key)
        }
        echo "NGC Release Image test is disabled."
    }

    // Update testConfigs image field using the map from globalVars
    testConfigs.each { key, config ->
        if (globalVars[IMAGE_KEY_TO_TAG] && globalVars[IMAGE_KEY_TO_TAG][key]) {
            config.image = globalVars[IMAGE_KEY_TO_TAG][key]
        }
    }

    // Filter out all configs that don't have image set
    testConfigs = testConfigs.findAll { key, config ->
        return config.image != null
    }

    echo "Filtered test configs with images:"
    println testConfigs

    def testJobs = testConfigs.collectEntries { key, values -> [values.name, {
        if (values.wheelInstalled) {
            // Configs flagged wheelInstalled run the l0_sanity_check test list on a pod of the given GPU type.
            stage(values.name) {
                echo "Run ${values.name} sanity test."
                // Declared with `def` so the pod spec stays local to this branch. Without it the
                // assignment creates a script-binding global that every returned job closure shares.
                def imageSanitySpec = createKubernetesPodConfig(values.image, values.gpuType, values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y git rsync curl")
                    runLLMTestlistOnPlatform(pipeline, values.gpuType, "l0_sanity_check", values.config, false, values.name, 1, 1, true, null)
                })
            }
        } else {
            // The remaining configs run runLLMBuild inside the image on a "build" pod.
            stage(values.name) {
                // Local for the same reason as above.
                def imageSanitySpec = createKubernetesPodConfig(values.image, "build", values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    def cpuArch = values.k8sArch == "amd64" ? X86_64_TRIPLE : AARCH64_TRIPLE
                    runLLMBuild(pipeline, cpuArch, false, "imageTest/")
                })
            }
        }
    }]}

    return testJobs
}
|
||||
|
||||
|
||||
pipeline {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("", "agent")
|
||||
@ -2306,7 +2398,10 @@ pipeline {
|
||||
when {
|
||||
expression {
|
||||
// Only run the test list validation when necessary
|
||||
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
|
||||
env.targetArch == X86_64_TRIPLE &&
|
||||
testFilter[ONLY_DOCS_FILE_CHANGED] == false &&
|
||||
!(env.JOB_NAME ==~ /.*Multi-GPU.*/) &&
|
||||
!(env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/)
|
||||
}
|
||||
}
|
||||
steps
|
||||
@ -2319,7 +2414,11 @@ pipeline {
|
||||
stage("Test") {
|
||||
steps {
|
||||
script {
|
||||
parallelJobs = launchTestJobs(this, testFilter)
|
||||
if (env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/) {
|
||||
parallelJobs = launchTestJobsForImagesSanityCheck(this, globalVars)
|
||||
} else {
|
||||
parallelJobs = launchTestJobs(this, testFilter)
|
||||
}
|
||||
|
||||
singleGpuJobs = parallelJobs
|
||||
dgxJobs = [:]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user