mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
infra: [TRTLLM-5072] Add SBSA release images (#4231)
* infra: [TRTLLM-5072] Add SBSA release images and move SBSA to blossom
  Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
* Fix review
  Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
* Easy to review
  Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
* Fix BUILD_JOBS
  Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
* Use gitlab mirror for nixl and ucx
  Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
* Update BuildDockerImage.groovy
  Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
---------
Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
fb663b637a
commit
17d48e0009
@ -29,6 +29,11 @@ DOCKER_BUILD_OPTS ?= --pull --load
|
||||
DOCKER_BUILD_ARGS ?=
|
||||
DOCKER_PROGRESS ?= auto
|
||||
CUDA_ARCHS ?=
|
||||
PLATFORM ?= $(shell uname -m | grep -q 'aarch64' && echo "arm64" || echo "amd64")
|
||||
ifeq ($(PLATFORM), arm64)
|
||||
CUDA_ARCHS = '90-real;100-real;120-real'
|
||||
endif
|
||||
|
||||
BUILD_WHEEL_OPTS ?=
|
||||
BUILD_WHEEL_ARGS ?= $(shell grep 'ARG BUILD_WHEEL_ARGS=' Dockerfile.multi | grep -o '=.*' | tr -d '="')$(if $(CUDA_ARCHS), --cuda_architectures $(CUDA_ARCHS))$(if $(BUILD_WHEEL_OPTS), $(BUILD_WHEEL_OPTS))
|
||||
TORCH_INSTALL_TYPE ?= skip
|
||||
@ -42,7 +47,6 @@ TRT_LLM_VERSION ?= $(shell grep '^__version__' ../tensorrt_llm/version.py | g
|
||||
GITHUB_MIRROR ?=
|
||||
PYTHON_VERSION ?=
|
||||
NGC_STAGING_REPO ?= nvcr.io/nvstaging/tensorrt-llm
|
||||
PLATFORM ?= $(shell uname -m | grep -q 'aarch64' && echo "arm64" || echo "amd64")
|
||||
|
||||
define add_local_user
|
||||
docker build \
|
||||
@ -178,9 +182,14 @@ ubuntu22_%: BASE_TAG = 12.9.0-devel-ubuntu22.04
|
||||
|
||||
trtllm_%: STAGE = release
|
||||
trtllm_%: PUSH_TO_STAGING := 0
|
||||
trtllm_%: DEVEL_IMAGE = $(shell grep 'LLM_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
|
||||
trtllm_%: DEVEL_IMAGE = $(shell \
|
||||
if [ "$(PLATFORM)" = "amd64" ]; then \
|
||||
grep 'LLM_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"'; \
|
||||
elif [ "$(PLATFORM)" = "arm64" ]; then \
|
||||
grep 'LLM_SBSA_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"'; \
|
||||
fi)
|
||||
trtllm_%: IMAGE_NAME = $(shell grep 'IMAGE_NAME = ' ../jenkins/BuildDockerImage.groovy | grep -o '".*"' | tr -d '"')
|
||||
trtllm_%: IMAGE_TAG = $(shell git rev-parse --abbrev-ref HEAD | tr '/' '_')
|
||||
trtllm_%: IMAGE_TAG = $(shell git rev-parse --abbrev-ref HEAD | tr '/' '_')-$(PLATFORM)
|
||||
trtllm_run: WORK_DIR = /app/tensorrt_llm
|
||||
|
||||
# This requires a docker installation with multi-platform support
|
||||
|
||||
@ -6,8 +6,21 @@ GITHUB_URL="https://github.com"
|
||||
UCX_VERSION="v1.18.1"
|
||||
UCX_INSTALL_PATH="/usr/local/ucx/"
|
||||
|
||||
NIXL_VERSION="0.2.0"
|
||||
|
||||
UCX_REPO="https://github.com/openucx/ucx.git"
|
||||
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
|
||||
|
||||
UCX_MIRROR="https://gitlab-master.nvidia.com/ftp/GitHubSync/ucx.git"
|
||||
NIXL_MIRROR="https://gitlab-master.nvidia.com/ftp/GitHubSync/nixl.git"
|
||||
|
||||
if [ -n "${GITHUB_MIRROR}" ]; then
|
||||
UCX_REPO=${UCX_MIRROR}
|
||||
NIXL_REPO=${NIXL_MIRROR}
|
||||
fi
|
||||
|
||||
if [ ! -d ${UCX_INSTALL_PATH} ]; then
|
||||
git clone --depth 1 -b ${UCX_VERSION} https://github.com/openucx/ucx.git
|
||||
git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
|
||||
cd ucx
|
||||
./autogen.sh
|
||||
./contrib/configure-release --prefix=${UCX_INSTALL_PATH}
|
||||
@ -17,9 +30,6 @@ if [ ! -d ${UCX_INSTALL_PATH} ]; then
|
||||
echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
|
||||
fi
|
||||
|
||||
NIXL_VERSION="0.2.0"
|
||||
NIXL_REPO="${GITHUB_URL}/ai-dynamo/nixl.git"
|
||||
|
||||
ARCH_NAME="x86_64-linux-gnu"
|
||||
if [ "$(uname -m)" != "amd64" ] && [ "$(uname -m)" != "x86_64" ]; then
|
||||
ARCH_NAME="aarch64-linux-gnu"
|
||||
|
||||
@ -16,8 +16,10 @@ LLM_BRANCH = env.gitlabBranch? env.gitlabBranch : params.branch
|
||||
LLM_BRANCH_TAG = LLM_BRANCH.replaceAll('/', '_')
|
||||
|
||||
BUILD_JOBS = "32"
|
||||
BUILD_JOBS_RELEASE_X86_64 = "16"
|
||||
BUILD_JOBS_RELEASE_SBSA = "8"
|
||||
|
||||
def createKubernetesPodConfig(type)
|
||||
def createKubernetesPodConfig(type, arch = "amd64")
|
||||
{
|
||||
def targetCould = "kubernetes-cpu"
|
||||
def containerConfig = ""
|
||||
@ -75,6 +77,7 @@ def createKubernetesPodConfig(type)
|
||||
nodeSelector:
|
||||
nvidia.com/node_type: builder
|
||||
kubernetes.io/os: linux
|
||||
kubernetes.io/arch: ${arch}
|
||||
containers:
|
||||
${containerConfig}
|
||||
- name: jnlp
|
||||
@ -96,9 +99,10 @@ def createKubernetesPodConfig(type)
|
||||
}
|
||||
|
||||
|
||||
def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="")
|
||||
def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="", is_sbsa=false)
|
||||
{
|
||||
def tag = "x86_64-${target}-torch_${torchInstallType}${post_tag}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
|
||||
def arch = is_sbsa ? "sbsa" : "x86_64"
|
||||
def tag = "${arch}-${target}-torch_${torchInstallType}${post_tag}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
|
||||
|
||||
// Step 1: cloning tekit source code
|
||||
// allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
|
||||
@ -128,15 +132,31 @@ def buildImage(target, action="build", torchInstallType="skip", args="", custom_
|
||||
}
|
||||
}
|
||||
try {
|
||||
// Fix the build OOM issue of release builds
|
||||
def build_jobs = BUILD_JOBS
|
||||
if (target == "trtllm") {
|
||||
if (arch == "x86_64") {
|
||||
build_jobs = BUILD_JOBS_RELEASE_X86_64
|
||||
} else {
|
||||
build_jobs = BUILD_JOBS_RELEASE_SBSA
|
||||
}
|
||||
}
|
||||
containerGenFailure = null
|
||||
stage ("make ${target}_${action}") {
|
||||
retry(3)
|
||||
{
|
||||
// Fix the triton image pull timeout issue
|
||||
def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
|
||||
def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
|
||||
retry(3) {
|
||||
sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
|
||||
}
|
||||
|
||||
sh """
|
||||
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
|
||||
TORCH_INSTALL_TYPE=${torchInstallType} \
|
||||
IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
|
||||
BUILD_WHEEL_OPTS='-j ${BUILD_JOBS}' ${args} \
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
|
||||
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
|
||||
"""
|
||||
}
|
||||
@ -148,7 +168,7 @@ def buildImage(target, action="build", torchInstallType="skip", args="", custom_
|
||||
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
|
||||
TORCH_INSTALL_TYPE=${torchInstallType} \
|
||||
IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${custom_tag} \
|
||||
BUILD_WHEEL_OPTS='-j ${BUILD_JOBS}' ${args} \
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
|
||||
GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
|
||||
"""
|
||||
}
|
||||
@ -170,38 +190,6 @@ def buildImage(target, action="build", torchInstallType="skip", args="", custom_
|
||||
}
|
||||
|
||||
|
||||
/**
 * Triggers the remote "gh200-BuildImage" helper job and fails the current
 * stage when that job does not report SUCCESS.
 *
 * @param action forwarded to the remote job as its `action` parameter
 * @param type   forwarded to the remote job as its `type` parameter
 */
def triggerSBSARemoteJob(action, type) {
    script {
        // key=value lines handed to the remote job as its parameter block.
        def remoteParams = """
            token=L1_Nightly_Token
            hostJobName=${JOB_NAME}
            hostBuildNumber=${BUILD_NUMBER}
            gitlabBranch=${LLM_BRANCH}
            action=${action}
            type=${type}
        """.stripIndent()

        // Any error raised inside marks both the build and the stage as FAILURE.
        catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
            def remoteBuild = triggerRemoteJob(
                job: "https://prod.blsm.nvidia.com/sw-tensorrt-static-1/job/LLM/job/helpers/job/gh200-BuildImage/",
                auth: CredentialsAuth(credentials: "STATIC_1_TOKEN"),
                parameters: remoteParams,
                pollInterval: 60,
                abortTriggeredJob: true
            )

            // Compare the textual form of the remote result against SUCCESS.
            if (remoteBuild.getBuildResult().toString() != "SUCCESS") {
                error "Downstream job did not succeed"
            }
        }
    }
}
|
||||
|
||||
|
||||
pipeline {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("agent")
|
||||
@ -240,7 +228,7 @@ pipeline {
|
||||
}
|
||||
steps
|
||||
{
|
||||
buildImage("trtllm", "push", "skip", "", LLM_BRANCH_TAG)
|
||||
buildImage("trtllm", params.action, "skip", "", LLM_BRANCH_TAG)
|
||||
}
|
||||
}
|
||||
stage("Build x86_64-skip") {
|
||||
@ -252,22 +240,13 @@ pipeline {
|
||||
buildImage("tritondevel", params.action, "skip")
|
||||
}
|
||||
}
|
||||
stage("Build x86_64-pre_cxx11_abi") {
|
||||
stage("Build trtllm release-sbsa") {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("build")
|
||||
kubernetes createKubernetesPodConfig("build", "arm64")
|
||||
}
|
||||
steps
|
||||
{
|
||||
buildImage("devel", params.action, "src_non_cxx11_abi")
|
||||
}
|
||||
}
|
||||
stage("Build x86_64-cxx11_abi") {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("build")
|
||||
}
|
||||
steps
|
||||
{
|
||||
buildImage("devel", params.action, "src_cxx11_abi")
|
||||
buildImage("trtllm", params.action, "skip", "", LLM_BRANCH_TAG + "-sbsa", "", true)
|
||||
}
|
||||
}
|
||||
stage("Build rockylinux8 x86_64-skip-py3.10") {
|
||||
@ -290,29 +269,11 @@ pipeline {
|
||||
}
|
||||
stage("Build SBSA-skip") {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("agent")
|
||||
kubernetes createKubernetesPodConfig("build", "arm64")
|
||||
}
|
||||
steps
|
||||
{
|
||||
triggerSBSARemoteJob(params.action, "skip")
|
||||
}
|
||||
}
|
||||
stage("Build SBSA-pre_cxx11_abi") {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("agent")
|
||||
}
|
||||
steps
|
||||
{
|
||||
triggerSBSARemoteJob(params.action, "src_non_cxx11_abi")
|
||||
}
|
||||
}
|
||||
stage("Build SBSA-cxx11_abi") {
|
||||
agent {
|
||||
kubernetes createKubernetesPodConfig("agent")
|
||||
}
|
||||
steps
|
||||
{
|
||||
triggerSBSARemoteJob(params.action, "src_cxx11_abi")
|
||||
buildImage("tritondevel", params.action, "skip", "", "", "", true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,123 +0,0 @@
|
||||
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
|
||||
|
||||
import java.lang.Exception
|
||||
import groovy.transform.Field
|
||||
|
||||
// Docker image registry
|
||||
DOCKER_IMAGE = "docker:dind"
|
||||
IMAGE_NAME = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging"
|
||||
|
||||
// LLM repository configuration
|
||||
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
|
||||
LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
|
||||
}
|
||||
LLM_ROOT = "llm"
|
||||
|
||||
/**
 * Runs the `tritondevel_${action}` target of the docker Makefile for an SBSA
 * image, from inside a docker:dind container that shares the host docker socket.
 *
 * Flow: wipe the workspace, check out the requested branch, log in to the
 * registries, run make (retried up to 3 times), and always log out afterwards.
 *
 * @param action suffix of the make target to run (the target is `tritondevel_<action>`)
 * @param type   value passed as TORCH_INSTALL_TYPE and embedded in the image tag
 */
def buildImage(action, type) {
    def branch = env.gitlabBranch
    if (!branch) {
        // Without this guard a missing branch surfaces as a NullPointerException on replaceAll.
        error "gitlabBranch is not set; cannot derive the image tag or check out the source"
    }
    def branchTag = branch.replaceAll('/', '_')
    // Prefer the build number of the triggering (host) job when one was passed in.
    def buildNumber = env.hostBuildNumber ? env.hostBuildNumber : BUILD_NUMBER
    def stage_docker = "tritondevel"
    def tag = "sbsa-${stage_docker}-torch_${type}-${branchTag}-${buildNumber}"

    // Step 1: cloning tekit source code
    // Checking out from a forked repo is allowed; svc_tensorrt needs access to
    // that repo, otherwise the clone will fail.
    stage('Prepare') {
        echo "hostJobName: ${env.hostJobName}"
        echo "hostBuildNumber: ${env.hostBuildNumber}"
        echo "gitlabBranch: ${env.gitlabBranch}"
        echo "action: ${env.action}"
        echo "type: ${env.type}"
        sh 'pwd'
        sh 'ls -lah'
        // Start from an empty workspace.
        sh 'rm -rf ./*'
        sh 'ls -lah'
    }

    trtllm_utils.checkoutSource(LLM_REPO, branch, LLM_ROOT, true, true)

    // Step 2: run the docker make target inside the container
    docker.image(DOCKER_IMAGE).inside('-v /var/run/docker.sock:/var/run/docker.sock --privileged') {
        stage ("Install packages") {
            sh "pwd && ls -alh"
            sh "env"
            sh "apk add make git"
            sh "git config --global --add safe.directory '*'"

            // Secrets are expanded by the shell (single-quoted Groovy strings), not by
            // Groovy, and the password is fed on stdin so it never appears in the
            // generated script or on the docker command line.
            withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
                sh 'printf "%s" "$PASSWORD" | docker login urm.nvidia.com -u "$USERNAME" --password-stdin'
            }

            withCredentials([
                usernamePassword(
                    credentialsId: "svc_tensorrt_gitlab_read_api_token",
                    usernameVariable: 'USERNAME',
                    passwordVariable: 'PASSWORD'
                ),
                string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
            ]) {
                sh 'printf "%s" "$PASSWORD" | docker login "${DEFAULT_GIT_URL}:5005" -u "$USERNAME" --password-stdin'
            }
        }

        // Declared locally (instead of leaking into the script binding) so the
        // failure can be re-thrown after the logout in `finally`.
        def containerGenFailure = null
        try {
            // stage ("Generate Image") {
            //     retry(3)
            //     {
            //         sh "cd ${LLM_ROOT} && make -C docker release_build TORCH_INSTALL_TYPE=${type}" +
            //            " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
            //     }
            // }
            stage ("Perform '${action}' action on image") {
                retry(3) {
                    sh """cd ${LLM_ROOT} && make -C docker ${stage_docker}_${action} \
                        IMAGE_NAME=${IMAGE_NAME} \
                        IMAGE_TAG=${tag} \
                        TORCH_INSTALL_TYPE=${type} \
                        GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"""
                }
            }
        } catch (Exception ex) {
            // Remember the failure; it is re-thrown once the registries are logged out.
            containerGenFailure = ex
        } finally {
            stage ("Docker logout") {
                withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
                    sh "docker logout urm.nvidia.com"
                    // Shell-side expansion keeps the credential-backed URL out of the Groovy string.
                    sh 'docker logout "${DEFAULT_GIT_URL}:5005"'
                }
            }
            if (containerGenFailure != null) {
                throw containerGenFailure
            }
        }
    }
}
|
||||
|
||||
|
||||
// Single-stage declarative pipeline: run buildImage on an SBSA agent using the
// `action` and `type` values supplied through the environment.
pipeline {
    agent {
        label 'sbsa-a100-80gb-pcie-x4||sbsa-gh200-480gb'
    }

    options {
        // Valid options are listed at: https://www.jenkins.io/doc/book/pipeline/syntax/
        // Some steps (e.g. a results-analysis stage) do not need the source checked out.
        skipDefaultCheckout()
        // Timestamps make it easier to analyze how long each step/test takes.
        timestamps()
        timeout(time: 24, unit: 'HOURS')
    }

    environment {
        PIP_INDEX_URL = "https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
    }

    stages {
        stage("Build") {
            steps {
                buildImage(env.action, env.type)
            }
        }
    } // stages
} // pipeline
|
||||
Loading…
Reference in New Issue
Block a user