infra: [TRTLLM-5072] Add SBSA release images (#4231)

* infra: [TRTLLM-5072] Add SBSA release images and move SBSA builds to Blossom

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

* Address review comments

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

* Restructure changes to be easier to review

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

* Fix BUILD_JOBS

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

* Use GitLab mirror for NIXL and UCX

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>

* Update BuildDockerImage.groovy

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>

---------

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
Zhanrui Sun, 2025-05-18 00:00:06 +08:00, committed by GitHub
parent fb663b637a
commit 17d48e0009
4 changed files with 57 additions and 200 deletions

View File

@@ -29,6 +29,11 @@ DOCKER_BUILD_OPTS ?= --pull --load
DOCKER_BUILD_ARGS ?=
DOCKER_PROGRESS ?= auto
CUDA_ARCHS ?=
PLATFORM ?= $(shell uname -m | grep -q 'aarch64' && echo "arm64" || echo "amd64")
ifeq ($(PLATFORM), arm64)
CUDA_ARCHS = '90-real;100-real;120-real'
endif
BUILD_WHEEL_OPTS ?=
BUILD_WHEEL_ARGS ?= $(shell grep 'ARG BUILD_WHEEL_ARGS=' Dockerfile.multi | grep -o '=.*' | tr -d '="')$(if $(CUDA_ARCHS), --cuda_architectures $(CUDA_ARCHS))$(if $(BUILD_WHEEL_OPTS), $(BUILD_WHEEL_OPTS))
TORCH_INSTALL_TYPE ?= skip
@@ -42,7 +47,6 @@ TRT_LLM_VERSION ?= $(shell grep '^__version__' ../tensorrt_llm/version.py | g
GITHUB_MIRROR ?=
PYTHON_VERSION ?=
NGC_STAGING_REPO ?= nvcr.io/nvstaging/tensorrt-llm
PLATFORM ?= $(shell uname -m | grep -q 'aarch64' && echo "arm64" || echo "amd64")
define add_local_user
docker build \
@@ -178,9 +182,14 @@ ubuntu22_%: BASE_TAG = 12.9.0-devel-ubuntu22.04
trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0
trtllm_%: DEVEL_IMAGE = $(shell grep 'LLM_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
trtllm_%: DEVEL_IMAGE = $(shell \
if [ "$(PLATFORM)" = "amd64" ]; then \
grep 'LLM_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"'; \
elif [ "$(PLATFORM)" = "arm64" ]; then \
grep 'LLM_SBSA_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"'; \
fi)
trtllm_%: IMAGE_NAME = $(shell grep 'IMAGE_NAME = ' ../jenkins/BuildDockerImage.groovy | grep -o '".*"' | tr -d '"')
trtllm_%: IMAGE_TAG = $(shell git rev-parse --abbrev-ref HEAD | tr '/' '_')
trtllm_%: IMAGE_TAG = $(shell git rev-parse --abbrev-ref HEAD | tr '/' '_')-$(PLATFORM)
trtllm_run: WORK_DIR = /app/tensorrt_llm
# This requires a docker installation with multi-platform support
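For reference: with the new PLATFORM default, running make on an aarch64 host (e.g. a GH200 builder) automatically selects arm64 and pins CUDA_ARCHS to '90-real;100-real;120-real' (Hopper and Blackwell targets). A quick way to sanity-check the multi-platform prerequisites on a build host, sketched under the assumption that buildx is the multi-platform backend in use:

# What the Makefile's PLATFORM detection sees; aarch64 maps to arm64, everything else to amd64
uname -m

# List buildx builders and the platforms each one supports
docker buildx ls

# One common way to enable arm64 emulation on an x86_64 host (not something this Makefile sets up for you)
docker run --privileged --rm tonistiigi/binfmt --install arm64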

View File

@@ -6,8 +6,21 @@ GITHUB_URL="https://github.com"
UCX_VERSION="v1.18.1"
UCX_INSTALL_PATH="/usr/local/ucx/"
NIXL_VERSION="0.2.0"
UCX_REPO="https://github.com/openucx/ucx.git"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
UCX_MIRROR="https://gitlab-master.nvidia.com/ftp/GitHubSync/ucx.git"
NIXL_MIRROR="https://gitlab-master.nvidia.com/ftp/GitHubSync/nixl.git"
if [ -n "${GITHUB_MIRROR}" ]; then
UCX_REPO=${UCX_MIRROR}
NIXL_REPO=${NIXL_MIRROR}
fi
if [ ! -d ${UCX_INSTALL_PATH} ]; then
git clone --depth 1 -b ${UCX_VERSION} https://github.com/openucx/ucx.git
git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
cd ucx
./autogen.sh
./contrib/configure-release --prefix=${UCX_INSTALL_PATH}
@@ -17,9 +30,6 @@ if [ ! -d ${UCX_INSTALL_PATH} ]; then
echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
fi
NIXL_VERSION="0.2.0"
NIXL_REPO="${GITHUB_URL}/ai-dynamo/nixl.git"
ARCH_NAME="x86_64-linux-gnu"
if [ "$(uname -m)" != "amd64" ] && [ "$(uname -m)" != "x86_64" ]; then
ARCH_NAME="aarch64-linux-gnu"

View File

@@ -16,8 +16,10 @@ LLM_BRANCH = env.gitlabBranch? env.gitlabBranch : params.branch
LLM_BRANCH_TAG = LLM_BRANCH.replaceAll('/', '_')
BUILD_JOBS = "32"
BUILD_JOBS_RELEASE_X86_64 = "16"
BUILD_JOBS_RELEASE_SBSA = "8"
def createKubernetesPodConfig(type)
def createKubernetesPodConfig(type, arch = "amd64")
{
def targetCould = "kubernetes-cpu"
def containerConfig = ""
@@ -75,6 +77,7 @@ def createKubernetesPodConfig(type)
nodeSelector:
nvidia.com/node_type: builder
kubernetes.io/os: linux
kubernetes.io/arch: ${arch}
containers:
${containerConfig}
- name: jnlp
@@ -96,9 +99,10 @@ def createKubernetesPodConfig(type)
}
def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="")
def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="", is_sbsa=false)
{
def tag = "x86_64-${target}-torch_${torchInstallType}${post_tag}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
def arch = is_sbsa ? "sbsa" : "x86_64"
def tag = "${arch}-${target}-torch_${torchInstallType}${post_tag}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
// Step 1: cloning tekit source code
// allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -128,15 +132,31 @@ def buildImage(target, action="build", torchInstallType="skip", args="", custom_
}
}
try {
// Reduce parallel build jobs to avoid OOM in release builds
def build_jobs = BUILD_JOBS
if (target == "trtllm") {
if (arch == "x86_64") {
build_jobs = BUILD_JOBS_RELEASE_X86_64
} else {
build_jobs = BUILD_JOBS_RELEASE_SBSA
}
}
containerGenFailure = null
stage ("make ${target}_${action}") {
retry(3)
{
// Pre-pull the Triton base image with retries to work around pull timeouts
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
retry(3) {
sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
}
sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
BUILD_WHEEL_OPTS='-j ${BUILD_JOBS}' ${args} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
"""
}
@@ -148,7 +168,7 @@ def buildImage(target, action="build", torchInstallType="skip", args="", custom_
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${custom_tag} \
BUILD_WHEEL_OPTS='-j ${BUILD_JOBS}' ${args} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
"""
}
@@ -170,38 +190,6 @@ def buildImage(target, action="build", torchInstallType="skip", args="", custom_
}
def triggerSBSARemoteJob(action, type)
{
script
{
def parameters = """
token=L1_Nightly_Token
hostJobName=${JOB_NAME}
hostBuildNumber=${BUILD_NUMBER}
gitlabBranch=${LLM_BRANCH}
action=${action}
type=${type}
""".stripIndent()
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE')
{
def handle = triggerRemoteJob(
job: "https://prod.blsm.nvidia.com/sw-tensorrt-static-1/job/LLM/job/helpers/job/gh200-BuildImage/",
auth: CredentialsAuth(credentials: "STATIC_1_TOKEN"),
parameters: parameters,
pollInterval: 60,
abortTriggeredJob: true,
)
def status = handle.getBuildResult().toString()
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
}
}
}
pipeline {
agent {
kubernetes createKubernetesPodConfig("agent")
@@ -240,7 +228,7 @@ pipeline {
}
steps
{
buildImage("trtllm", "push", "skip", "", LLM_BRANCH_TAG)
buildImage("trtllm", params.action, "skip", "", LLM_BRANCH_TAG)
}
}
stage("Build x86_64-skip") {
@@ -252,22 +240,13 @@ pipeline {
buildImage("tritondevel", params.action, "skip")
}
}
stage("Build x86_64-pre_cxx11_abi") {
stage("Build trtllm release-sbsa") {
agent {
kubernetes createKubernetesPodConfig("build")
kubernetes createKubernetesPodConfig("build", "arm64")
}
steps
{
buildImage("devel", params.action, "src_non_cxx11_abi")
}
}
stage("Build x86_64-cxx11_abi") {
agent {
kubernetes createKubernetesPodConfig("build")
}
steps
{
buildImage("devel", params.action, "src_cxx11_abi")
buildImage("trtllm", params.action, "skip", "", LLM_BRANCH_TAG + "-sbsa", "", true)
}
}
stage("Build rockylinux8 x86_64-skip-py3.10") {
@ -290,29 +269,11 @@ pipeline {
}
stage("Build SBSA-skip") {
agent {
kubernetes createKubernetesPodConfig("agent")
kubernetes createKubernetesPodConfig("build", "arm64")
}
steps
{
triggerSBSARemoteJob(params.action, "skip")
}
}
stage("Build SBSA-pre_cxx11_abi") {
agent {
kubernetes createKubernetesPodConfig("agent")
}
steps
{
triggerSBSARemoteJob(params.action, "src_non_cxx11_abi")
}
}
stage("Build SBSA-cxx11_abi") {
agent {
kubernetes createKubernetesPodConfig("agent")
}
steps
{
triggerSBSARemoteJob(params.action, "src_cxx11_abi")
buildImage("tritondevel", params.action, "skip", "", "", "", true)
}
}
}
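Taken together with the Makefile changes, the new "Build trtllm release-sbsa" stage boils down to a make invocation along these lines on an arm64 builder pod (a sketch for action=push; the image name and tag placeholders stand in for values the pipeline derives at run time):

# Approximate command issued by the SBSA release stage
cd llm && make -C docker trtllm_push \
    TORCH_INSTALL_TYPE=skip \
    IMAGE_NAME=<image-name> \
    IMAGE_TAG=sbsa-trtllm-torch_skip-<branch>-<build> \
    BUILD_WHEEL_OPTS='-j 8' \
    GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote

On the arm64 pod, the Makefile's PLATFORM detection then selects the SBSA devel image from L0_MergeRequest.groovy and restricts CUDA_ARCHS accordingly, while BUILD_JOBS_RELEASE_SBSA=8 keeps release compilation within the pod's memory budget.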

View File

@@ -1,123 +0,0 @@
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
import java.lang.Exception
import groovy.transform.Field
// Docker image registry
DOCKER_IMAGE = "docker:dind"
IMAGE_NAME = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging"
// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"
def buildImage(action, type)
{
def branch = env.gitlabBranch
def branchTag = branch.replaceAll('/', '_')
def buildNumber = env.hostBuildNumber ? env.hostBuildNumber : BUILD_NUMBER
def stage_docker = "tritondevel"
def tag = "sbsa-${stage_docker}-torch_${type}-${branchTag}-${buildNumber}"
// Step 1: cloning tekit source code
// allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
stage('Prepare') {
echo "hostJobName: ${env.hostJobName}"
echo "hostBuildNumber: ${env.hostBuildNumber}"
echo "gitlabBranch: ${env.gitlabBranch}"
echo "action: ${env.action}"
echo "type: ${env.type}"
sh 'pwd'
sh 'ls -lah'
sh 'rm -rf ./*'
sh 'ls -lah'
}
trtllm_utils.checkoutSource(LLM_REPO, branch, LLM_ROOT, true, true)
// Step 2: building wheels in container
docker.image(DOCKER_IMAGE).inside('-v /var/run/docker.sock:/var/run/docker.sock --privileged') {
stage ("Install packages") {
sh "pwd && ls -alh"
sh "env"
sh "apk add make git"
sh "git config --global --add safe.directory '*'"
withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
}
withCredentials([
usernamePassword(
credentialsId: "svc_tensorrt_gitlab_read_api_token",
usernameVariable: 'USERNAME',
passwordVariable: 'PASSWORD'
),
string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
]) {
sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
}
}
try {
containerGenFailure = null
// stage ("Generate Image") {
// retry(3)
// {
// sh "cd ${LLM_ROOT} && make -C docker release_build TORCH_INSTALL_TYPE=${type}" +
// " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
// }
// }
stage ("Perform '${action}' action on image") {
retry(3)
{
sh """cd ${LLM_ROOT} && make -C docker ${stage_docker}_${action} \
IMAGE_NAME=${IMAGE_NAME} \
IMAGE_TAG=${tag} \
TORCH_INSTALL_TYPE=${type} \
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"""
}
}
} catch (Exception ex) {
containerGenFailure = ex
} finally {
stage ("Docker logout") {
withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
sh "docker logout urm.nvidia.com"
sh "docker logout ${DEFAULT_GIT_URL}:5005"
}
}
if (containerGenFailure != null) {
throw containerGenFailure
}
}
}
}
pipeline {
agent {
label 'sbsa-a100-80gb-pcie-x4||sbsa-gh200-480gb'
}
options {
// Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
// some step like results analysis stage, does not need to check out source code
skipDefaultCheckout()
// to better analyze the time for each step/test
timestamps()
timeout(time: 24, unit: 'HOURS')
}
environment {
PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
}
stages {
stage("Build")
{
steps
{
buildImage(env.action, env.type)
}
}
} // stages
} // pipeline