TensorRT-LLMs/jenkins/BuildDockerImage.groovy
Zhanrui Sun 587a36db96
infra: [TRTLLM-4370] Fix the build error when build GH200 image (#3229)
* infra: Fix the build error when build GH200 image

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

* remove and update checkoutSource function

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

---------

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
2025-04-03 17:33:50 +08:00

323 lines
11 KiB
Groovy

@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
import java.lang.Exception
import groovy.transform.Field
// Docker image registry; passed to make as IMAGE_NAME by buildImage.
IMAGE_NAME = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging"

// LLM repository configuration.
// Prefer the repository URL provided in env.gitlabSourceRepoHttpUrl; otherwise use the URL
// stored in the 'default-llm-repo' string credential.
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ?: "${DEFAULT_LLM_REPO}"
}

// Third argument of trtllm_utils.checkoutSource; buildImage later does `cd` into it.
LLM_ROOT = "llm"

// Branch to build: env.gitlabBranch wins, then the "branch" job parameter.
// params.branch is read here before the pipeline block below has declared the parameter, so it
// can still be null (for example on the first manual build of a job). Fall back to the
// parameter's declared default so the replaceAll() call below cannot hit a null reference.
LLM_BRANCH = env.gitlabBranch ?: params.branch ?: "main"

// Branch name with '/' replaced by '_' so that it can be embedded in an image tag.
LLM_BRANCH_TAG = LLM_BRANCH.replaceAll('/', '_')

// Parallelism passed to the wheel build as "-j <BUILD_JOBS>".
BUILD_JOBS = "32"
// Assembles the argument map handed to the `kubernetes` agent directive.
//
// type - "agent": adds an alpine container next to the jnlp container
//        "build": adds a privileged docker-in-docker container next to the jnlp container
//        anything else: no additional container is added
//
// Returns a map with the keys cloud, namespace and yaml (the pod specification text).
def createKubernetesPodConfig(type)
{
    // Container entry spliced into the pod spec right before the jnlp container.
    def extraContainer = ""
    if (type == "agent") {
        extraContainer = """
- name: alpine
image: urm.nvidia.com/docker/alpine:latest
command: ['cat']
tty: true
resources:
requests:
cpu: '2'
memory: 10Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 10Gi
ephemeral-storage: 25Gi
imagePullPolicy: Always"""
    } else if (type == "build") {
        extraContainer = """
- name: docker
image: urm.nvidia.com/docker/docker:dind
tty: true
resources:
requests:
cpu: 16
memory: 72Gi
ephemeral-storage: 200Gi
limits:
cpu: 16
memory: 256Gi
ephemeral-storage: 200Gi
imagePullPolicy: Always
securityContext:
privileged: true
capabilities:
add:
- SYS_ADMIN"""
    }

    // The \$ escapes keep $(JENKINS_SECRET) / $(JENKINS_NAME) literal in the generated text
    // instead of letting Groovy interpolate them.
    return [
        cloud: "kubernetes-cpu",
        namespace: "sw-tensorrt",
        yaml: """
apiVersion: v1
kind: Pod
spec:
qosClass: Guaranteed
nodeSelector:
nvidia.com/node_type: builder
kubernetes.io/os: linux
containers:
${extraContainer}
- name: jnlp
image: urm.nvidia.com/docker/jenkins/inbound-agent:4.11-1-jdk11
args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
resources:
requests:
cpu: '2'
memory: 10Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 10Gi
ephemeral-storage: 25Gi
""".stripIndent(),
    ]
}
// Checks out the sources and runs one docker/Makefile goal inside the "docker" container.
//
// target           - first half of the make goal; the goal is "<target>_<action>"
// action           - second half of the make goal (the job passes "build" or "push")
// torchInstallType - forwarded as TORCH_INSTALL_TYPE and embedded in the generated image tag
// args             - extra text appended verbatim to the make command line
// custom_tag       - when non-empty, the same make goal is run a second time with this IMAGE_TAG
// post_tag         - suffix appended to the generated image tag
//
// Any failure of the make steps is propagated to the caller after the registry logout has run.
def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="")
{
    def tag = "x86_64-${target}-torch_${torchInstallType}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}${post_tag}"

    // Step 1: cloning tekit source code
    // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
    // NOTE(review): the meaning of the two trailing `true` flags is defined by the shared library - confirm there.
    trtllm_utils.checkoutSource(LLM_REPO, LLM_BRANCH, LLM_ROOT, true, true)

    // Step 2: building wheels in container
    container("docker") {
        stage ("Install packages") {
            sh "pwd && ls -alh"
            sh "env"
            sh "apk add make git"
            sh "git config --global --add safe.directory '*'"

            // Secrets are expanded by the shell (single-quoted Groovy strings), not by Groovy, so
            // they never become part of the command text Jenkins handles. The password is fed
            // through stdin so it does not appear on the docker command line either.
            withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
                sh 'printf "%s" "$PASSWORD" | docker login urm.nvidia.com -u "$USERNAME" --password-stdin'
            }
            withCredentials([
                usernamePassword(
                    credentialsId: "svc_tensorrt_gitlab_read_api_token",
                    usernameVariable: 'USERNAME',
                    passwordVariable: 'PASSWORD'
                ),
                string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
            ]) {
                sh 'printf "%s" "$PASSWORD" | docker login "$DEFAULT_GIT_URL:5005" -u "$USERNAME" --password-stdin'
            }
        }

        // try/finally instead of storing the exception in a variable: the previous undeclared
        // variable lived in the script binding, which is shared by every parallel branch that
        // calls this function, so one branch could overwrite or rethrow another branch's failure.
        try {
            stage ("make ${target}_${action}") {
                retry(3) {
                    sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
BUILD_WHEEL_OPTS='-j ${BUILD_JOBS}' ${args} \
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
"""
                }
            }
            if (custom_tag) {
                // Same goal again, this time tagged with the caller-supplied tag (not retried).
                stage ("custom tag: ${custom_tag}") {
                    sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${custom_tag} \
BUILD_WHEEL_OPTS='-j ${BUILD_JOBS}' ${args} \
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
"""
                }
            }
        } finally {
            // Always drop the registry logins, whether or not the build succeeded.
            stage ("Docker logout") {
                withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
                    sh "docker logout urm.nvidia.com"
                    sh 'docker logout "$DEFAULT_GIT_URL:5005"'
                }
            }
        }
    }
}
// Triggers the remote gh200-BuildImage helper job and checks its build result.
//
// action - forwarded to the remote job as its "action" parameter
// type   - forwarded to the remote job as its "type" parameter
//
// Errors raised inside the catchError block, including a remote result other than "SUCCESS",
// set both the stage result and the build result to FAILURE.
def triggerSBSARemoteJob(action, type)
{
    script {
        // One key=value pair per line.
        // NOTE(review): the token value is hard-coded in this script - confirm that is intended.
        def remoteParams = """
token=L1_Nightly_Token
hostJobName=${JOB_NAME}
hostBuildNumber=${BUILD_NUMBER}
gitlabBranch=${LLM_BRANCH}
action=${action}
type=${type}
""".stripIndent()

        catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
            def remoteBuild = triggerRemoteJob(
                job: "https://prod.blsm.nvidia.com/sw-tensorrt-static-1/job/LLM/job/helpers/job/gh200-BuildImage/",
                auth: CredentialsAuth(credentials: "STATIC_1_TOKEN"),
                parameters: remoteParams,
                pollInterval: 60,
                abortTriggeredJob: true,
            )
            if (remoteBuild.getBuildResult().toString() != "SUCCESS") {
                error "Downstream job did not succeed"
            }
        }
    }
}
// Declarative pipeline: one "Build" stage that fans out into parallel image builds.
// The pipeline itself runs on the "agent" pod; each parallel stage requests its own pod.
pipeline {
agent {
kubernetes createKubernetesPodConfig("agent")
}
parameters {
// Used as the branch to build when env.gitlabBranch is not set (see LLM_BRANCH).
string(
name: "branch",
defaultValue: "main",
description: "Branch to launch job."
)
// Passed as the action argument to buildImage / triggerSBSARemoteJob by most stages below.
choice(
name: "action",
choices: ["build", "push"],
description: "Docker image generation action. build: only perform image build step; push: build docker image and push it to artifacts"
)
}
options {
// Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
// some step like results analysis stage, does not need to check out source code
skipDefaultCheckout()
// to better analyze the time for each step/test
timestamps()
// Abort the whole run after 24 hours.
timeout(time: 24, unit: 'HOURS')
}
environment {
// Package index URL exported to every step of the pipeline.
PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
}
stages {
stage("Build")
{
parallel {
// Unlike the other stages this one always uses the "push" action, regardless of
// params.action, and additionally tags the image with LLM_BRANCH_TAG (custom_tag).
stage("Build trtllm release") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("trtllm", "push", "skip", "", LLM_BRANCH_TAG)
}
}
// The three "devel" stages differ only in the torchInstallType argument.
stage("Build x86_64-skip") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("devel", params.action, "skip")
}
}
stage("Build x86_64-pre_cxx11_abi") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("devel", params.action, "src_non_cxx11_abi")
}
}
stage("Build x86_64-cxx11_abi") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("devel", params.action, "src_cxx11_abi")
}
}
// The rockylinux8 stages pass PYTHON_VERSION as an extra make argument and add a
// "-py310" / "-py312" suffix (post_tag) to the generated image tag.
stage("Build rockylinux8 x86_64-skip-py3.10") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("rockylinux8", params.action, "skip", "PYTHON_VERSION=3.10.12", "", "-py310")
}
}
stage("Build rockylinux8 x86_64-skip-py3.12") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("rockylinux8", params.action, "skip", "PYTHON_VERSION=3.12.3", "", "-py312")
}
}
// This stage only triggers a remote job, so it requests the "agent" pod rather than
// the docker-in-docker "build" pod.
stage("Build SBSA-skip") {
agent {
kubernetes createKubernetesPodConfig("agent")
}
steps
{
triggerSBSARemoteJob(params.action, "skip")
}
}
// Waived due to a pytorch issue: https://github.com/pytorch/pytorch/issues/141083
// stage("Build SBSA-pre_cxx11_abi") {
// agent {
// kubernetes createKubernetesPodConfig("agent")
// }
// steps
// {
// triggerSBSARemoteJob(params.action, "src_non_cxx11_abi")
// }
// }
// stage("Build SBSA-cxx11_abi") {
// agent {
// kubernetes createKubernetesPodConfig("agent")
// }
// steps
// {
// triggerSBSARemoteJob(params.action, "src_cxx11_abi")
// }
// }
}
}
} // stages
} // pipeline