Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

infra: [TRTLLM-5247][TRTLLM-5248][TRTLLM-5249] Refactor docker build image groovy and support NGC images (#4294)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>

This commit is contained in:
parent 058f83e47b
commit 7b2b657198
docker/Dockerfile.multi
@@ -117,7 +117,7 @@ RUN mkdir -p /root/.cache/pip /root/.cache/ccache
 ENV CCACHE_DIR=/root/.cache/ccache
 # Build the TRT-LLM wheel
 ARG BUILD_WHEEL_ARGS="--clean --python_bindings --benchmarks"
-RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \
+RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=${CCACHE_DIR} \
     python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}
 
 FROM ${DEVEL_IMAGE} AS release
docker/Makefile
@@ -161,24 +161,24 @@ release_run: WORK_DIR = /app/tensorrt_llm
 
 # For x86_64
 jenkins_%: IMAGE_WITH_TAG = $(shell grep 'LLM_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
-jenkins_%: STAGE = devel
+jenkins_%: STAGE = tritondevel
 
 # For aarch64
 jenkins-aarch64_%: IMAGE_WITH_TAG = $(shell grep 'LLM_SBSA_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
-jenkins-aarch64_%: STAGE = devel
+jenkins-aarch64_%: STAGE = tritondevel
 
 # For x86_64
 jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
-jenkins-rockylinux8_%: STAGE = devel
+jenkins-rockylinux8_%: STAGE = tritondevel
 jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
 jenkins-rockylinux8_%: BASE_TAG = 12.9.0-devel-rockylinux8
 
-rockylinux8_%: STAGE = devel
+rockylinux8_%: STAGE = tritondevel
 rockylinux8_%: BASE_IMAGE = nvidia/cuda
 rockylinux8_%: BASE_TAG = 12.9.0-devel-rockylinux8
 
 # For x86_64 and aarch64
-ubuntu22_%: STAGE = devel
+ubuntu22_%: STAGE = tritondevel
 ubuntu22_%: BASE_IMAGE = nvidia/cuda
 ubuntu22_%: BASE_TAG = 12.9.0-devel-ubuntu22.04
 
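Note: STAGE picks which stage of the multi-stage docker/Dockerfile.multi these targets build, so the Jenkins CI images now stop at the tritondevel stage instead of devel. Roughly, and assuming the Makefile forwards STAGE to docker build as the target stage (a sketch, not the Makefile's exact recipe), a jenkins_build invocation now amounts to:

    // Sketch only: what jenkins_build implies with STAGE=tritondevel, assuming
    // STAGE is forwarded via --target (tags and build args omitted).
    sh "docker build --target tritondevel -f docker/Dockerfile.multi ."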
jenkins/BuildDockerImage.groovy
@@ -12,17 +12,72 @@ withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
 }
 LLM_ROOT = "llm"
 
-LLM_BRANCH = env.gitlabBranch? env.gitlabBranch : params.branch
+LLM_BRANCH = env.gitlabBranch ?: params.branch
 LLM_BRANCH_TAG = LLM_BRANCH.replaceAll('/', '_')
 
-BUILD_JOBS = "32"
-BUILD_JOBS_RELEASE_X86_64 = "16"
-BUILD_JOBS_RELEASE_SBSA = "8"
+LLM_COMMIT_OR_BRANCH = env.gitlabCommit ?: LLM_BRANCH
+LLM_SHORT_COMMIT = env.gitlabCommit ? env.gitlabCommit.substring(0, 7) : "undefined"
+LLM_DEFAULT_TAG = env.defaultTag ?: "${LLM_SHORT_COMMIT}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
+
+BUILD_JOBS = "32"
+BUILD_JOBS_RELEASE_X86_64 = "32"
+BUILD_JOBS_RELEASE_SBSA = "32"
+
+CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
+
+@Field
+def GITHUB_PR_API_URL = "github_pr_api_url"
+@Field
+def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
+@Field
+def ACTION_INFO = "action_info"
+def globalVars = [
+    (GITHUB_PR_API_URL): null,
+    (CACHED_CHANGED_FILE_LIST): null,
+    (ACTION_INFO): null,
+]
+
+@Field
+def imageKeyToTag = [:]
 
-def createKubernetesPodConfig(type, arch = "amd64")
+def createKubernetesPodConfig(type, arch = "amd64", build_wheel = false)
 {
     def targetCould = "kubernetes-cpu"
     def containerConfig = ""
+    def selectors = """
+                nodeSelector:
+                  nvidia.com/node_type: builder
+                  kubernetes.io/os: linux
+                  kubernetes.io/arch: ${arch}"""
+
+    if (build_wheel && arch == "arm64") {
+        // For aarch64, we need to use hostname to fix the ucxx issue when building wheels
+        selectors += """
+                affinity:
+                  nodeAffinity:
+                    requiredDuringSchedulingIgnoredDuringExecution:
+                      nodeSelectorTerms:
+                      - matchExpressions:
+                        - key: "kubernetes.io/hostname"
+                          operator: In
+                          values:
+                          - "rl300-0008.ipp2u1.colossus"
+                          - "rl300-0014.ipp2u1.colossus"
+                          - "rl300-0023.ipp2u1.colossus"
+                          - "rl300-0024.ipp2u1.colossus"
+                          - "rl300-0030.ipp2u1.colossus"
+                          - "rl300-0040.ipp2u1.colossus"
+                          - "rl300-0041.ipp2u1.colossus"
+                          - "rl300-0042.ipp2u1.colossus"
+                          - "rl300-0043.ipp2u1.colossus"
+                          - "rl300-0044.ipp2u1.colossus"
+                          - "rl300-0045.ipp2u1.colossus"
+                          - "rl300-0046.ipp2u1.colossus"
+                          - "rl300-0047.ipp2u1.colossus"
+        """
+    }
+
     switch(type)
     {
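Note: the LLM_BRANCH change is pure cleanup; Groovy's Elvis operator returns the left operand when it is truthy and the right one otherwise, so the self-repeating ternary is redundant. The new LLM_COMMIT_OR_BRANCH and LLM_DEFAULT_TAG globals use the same idiom. A minimal illustration (plain Groovy):

    // "a ?: b" is shorthand for "a ? a : b" without repeating (or re-evaluating) a.
    def gitlabBranch = null
    assert (gitlabBranch ?: "main") == "main"
    gitlabBranch = "feature/x"
    assert (gitlabBranch ?: "main") == "feature/x"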
@@ -44,9 +99,10 @@ def createKubernetesPodConfig(type, arch = "amd64")
                     imagePullPolicy: Always"""
             break
         case "build":
+            // Use a customized docker:dind image with essential dependencies
             containerConfig = """
                   - name: docker
-                    image: urm.nvidia.com/docker/docker:dind
+                    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:202505221445_docker_dind_withbash
                     tty: true
                     resources:
                       requests:
@@ -65,19 +121,34 @@ def createKubernetesPodConfig(type, arch = "amd64")
                         - SYS_ADMIN"""
             break
     }
+    def pvcVolume = """
+                - name: sw-tensorrt-pvc
+                  persistentVolumeClaim:
+                    claimName: sw-tensorrt-pvc
+    """
+    if (arch == "arm64") {
+        // PVC mount isn't supported on aarch64 platform. Use NFS as a WAR.
+        pvcVolume = """
+                - name: sw-tensorrt-pvc
+                  nfs:
+                    server: 10.117.145.13
+                    path: /vol/scratch1/scratch.svc_tensorrt_blossom
+    """
+    }
+    def nodeLabelPrefix = "cpu"
+    def jobName = "llm-build-images"
+    def buildID = env.BUILD_ID
+    def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}")
     def podConfig = [
         cloud: targetCould,
         namespace: "sw-tensorrt",
+        label: nodeLabel,
         yaml: """
             apiVersion: v1
             kind: Pod
             spec:
                 qosClass: Guaranteed
-                nodeSelector:
-                  nvidia.com/node_type: builder
-                  kubernetes.io/os: linux
-                  kubernetes.io/arch: ${arch}
+                ${selectors}
                 containers:
                   ${containerConfig}
                   - name: jnlp
@@ -92,6 +163,12 @@ def createKubernetesPodConfig(type, arch = "amd64")
                         cpu: '2'
                         memory: 10Gi
                         ephemeral-storage: 25Gi
+                    volumeMounts:
+                    - name: sw-tensorrt-pvc
+                      mountPath: "/mnt/sw-tensorrt-pvc"
+                      readOnly: false
+                volumes:
+                ${pvcVolume}
         """.stripIndent(),
     ]
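Note: the new volume wires the shared scratch space into every build pod at /mnt/sw-tensorrt-pvc, which is where the CCACHE_DIR global added above points, so ccache state persists across builds. A hypothetical pipeline step (not part of this commit) to sanity-check the mount from inside a build pod:

    // Hypothetical verification step: confirm the shared scratch space (PVC on
    // x86_64, NFS on arm64) is mounted and the ccache directory is reachable.
    sh "df -h /mnt/sw-tensorrt-pvc && ls ${CCACHE_DIR}"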
@@ -99,94 +176,218 @@ def createKubernetesPodConfig(type, arch = "amd64")
 }
 
 
-def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="", is_sbsa=false)
-{
-    def arch = is_sbsa ? "sbsa" : "x86_64"
-    def tag = "${arch}-${target}-torch_${torchInstallType}${post_tag}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
-
-    // Step 1: cloning tekit source code
-    // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
-    trtllm_utils.checkoutSource(LLM_REPO, LLM_BRANCH, LLM_ROOT, true, true)
-
-    // Step 2: building wheels in container
-    container("docker") {
-        stage ("Install packages") {
-            sh "pwd && ls -alh"
-            sh "env"
-            sh "apk add make git"
-            sh "git config --global --add safe.directory '*'"
-
-            withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-                sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
-            }
-
-            withCredentials([
-                usernamePassword(
-                    credentialsId: "svc_tensorrt_gitlab_read_api_token",
-                    usernameVariable: 'USERNAME',
-                    passwordVariable: 'PASSWORD'
-                ),
-                string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
-            ]) {
-                sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
-            }
-        }
-        try {
-            // Fix the build OOM issue of release builds
-            def build_jobs = BUILD_JOBS
-            if (target == "trtllm") {
-                if (arch == "x86_64") {
-                    build_jobs = BUILD_JOBS_RELEASE_X86_64
-                } else {
-                    build_jobs = BUILD_JOBS_RELEASE_SBSA
-                }
-            }
-            containerGenFailure = null
-            stage ("make ${target}_${action}") {
-                retry(3)
-                {
-                    // Fix the triton image pull timeout issue
-                    def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
-                    def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
-                    retry(3) {
-                        sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
-                    }
-
-                    sh """
-                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
-                    TORCH_INSTALL_TYPE=${torchInstallType} \
-                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
-                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
-                    GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
-                    """
-                }
-            }
-
-            if (custom_tag) {
-                stage ("custom tag: ${custom_tag}") {
-                    sh """
-                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
-                    TORCH_INSTALL_TYPE=${torchInstallType} \
-                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${custom_tag} \
-                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
-                    GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
-                    """
-                }
-            }
-        } catch (Exception ex) {
-            containerGenFailure = ex
-        } finally {
-            stage ("Docker logout") {
-                withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
-                    sh "docker logout urm.nvidia.com"
-                    sh "docker logout ${DEFAULT_GIT_URL}:5005"
-                }
-            }
-            if (containerGenFailure != null) {
-                throw containerGenFailure
-            }
-        }
-    }
-}
+def buildImage(config, imageKeyToTag)
+{
+    def target = config.target
+    def action = config.action
+    def torchInstallType = config.torchInstallType
+    def args = config.args ?: ""
+    def customTag = config.customTag
+    def postTag = config.postTag
+    def dependentTarget = config.dependentTarget
+    def arch = config.arch == 'arm64' ? 'sbsa' : 'x86_64'
+
+    def tag = "${arch}-${target}-torch_${torchInstallType}${postTag}-${LLM_DEFAULT_TAG}"
+
+    def dependentTargetTag = tag.replace("${arch}-${target}-", "${arch}-${dependentTarget}-")
+
+    if (target == "ngc-release") {
+        imageKeyToTag["NGC Devel Image ${config.arch}"] = "${IMAGE_NAME}/${dependentTarget}:${dependentTargetTag}"
+        imageKeyToTag["NGC Release Image ${config.arch}"] = "${IMAGE_NAME}/${target}:${tag}"
+    }
+
+    args += " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
+
+    stage (config.stageName) {
+        // Step 1: Clone TRT-LLM source codes
+        // If using a forked repo, svc_tensorrt needs to have the access to the forked repo.
+        trtllm_utils.checkoutSource(LLM_REPO, LLM_COMMIT_OR_BRANCH, LLM_ROOT, true, true)
+    }
+
+    // Step 2: Build the images
+    stage ("Install packages") {
+        sh "pwd && ls -alh"
+        sh "env"
+        sh "apk add make git"
+        sh "git config --global --add safe.directory '*'"
+
+        withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+            sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
+        }
+
+        withCredentials([
+            usernamePassword(
+                credentialsId: "svc_tensorrt_gitlab_read_api_token",
+                usernameVariable: 'USERNAME',
+                passwordVariable: 'PASSWORD'
+            ),
+            string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
+        ]) {
+            sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
+        }
+    }
+    try {
+        def build_jobs = BUILD_JOBS
+        // Fix the triton image pull timeout issue
+        def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
+        def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
+        containerGenFailure = null
+
+        if (dependentTarget) {
+            stage ("make ${dependentTarget}_${action} (${arch})") {
+                retry(3) {
+                    retry(3) {
+                        sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
+                    }
+                    sh """
+                    cd ${LLM_ROOT} && make -C docker ${dependentTarget}_${action} \
+                    TORCH_INSTALL_TYPE=${torchInstallType} \
+                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${dependentTargetTag} \
+                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
+                    """
+                }
+                args += " DEVEL_IMAGE=${IMAGE_NAME}/${dependentTarget}:${dependentTargetTag}"
+            }
+        }
+
+        // Avoid the frequency of OOM issue when building the wheel
+        if (target == "trtllm") {
+            if (arch == "x86_64") {
+                build_jobs = BUILD_JOBS_RELEASE_X86_64
+            } else {
+                build_jobs = BUILD_JOBS_RELEASE_SBSA
+            }
+        }
+        stage ("make ${target}_${action} (${arch})") {
+            retry(3) {
+                retry(3) {
+                    sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
+                }
+
+                sh """
+                cd ${LLM_ROOT} && make -C docker ${target}_${action} \
+                TORCH_INSTALL_TYPE=${torchInstallType} \
+                IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
+                BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
+                """
+            }
+        }
+
+        if (customTag) {
+            stage ("custom tag: ${customTag} (${arch})") {
+                sh """
+                cd ${LLM_ROOT} && make -C docker ${target}_${action} \
+                TORCH_INSTALL_TYPE=${torchInstallType} \
+                IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${customTag} \
+                BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
+                """
+            }
+        }
+    } catch (Exception ex) {
+        containerGenFailure = ex
+    } finally {
+        stage ("Docker logout") {
+            withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
+                sh "docker logout urm.nvidia.com"
+                sh "docker logout ${DEFAULT_GIT_URL}:5005"
+            }
+        }
+        if (containerGenFailure != null) {
+            throw containerGenFailure
+        }
+    }
+}
+
+
+def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
+    def defaultBuildConfig = [
+        target: "tritondevel",
+        action: params.action,
+        customTag: "",
+        postTag: "",
+        args: "",
+        torchInstallType: "skip",
+        arch: "amd64",
+        build_wheel: false,
+        dependentTarget: "",
+    ]
+
+    def release_action = env.JOB_NAME ==~ /.*PostMerge.*/ ? "push" : params.action
+    def buildConfigs = [
+        "Build trtllm release (x86_64)": [
+            target: "trtllm",
+            action: release_action,
+            customTag: LLM_BRANCH_TAG + "-x86_64",
+            build_wheel: true,
+        ],
+        "Build trtllm release (SBSA)": [
+            target: "trtllm",
+            action: release_action,
+            customTag: LLM_BRANCH_TAG + "-sbsa",
+            build_wheel: true,
+            arch: "arm64"
+        ],
+        "Build CI image (x86_64 tritondevel)": [:],
+        "Build CI image (SBSA tritondevel)": [
+            arch: "arm64",
+        ],
+        "Build CI image (RockyLinux8 Python310)": [
+            target: "rockylinux8",
+            args: "PYTHON_VERSION=3.10.12",
+            postTag: "-py310",
+        ],
+        "Build CI image(RockyLinux8 Python312)": [
+            target: "rockylinux8",
+            args: "PYTHON_VERSION=3.12.3 STAGE=tritondevel",
+            postTag: "-py312",
+        ],
+        "Build NGC devel and release (x86_64)": [
+            target: "ngc-release",
+            action: release_action,
+            customTag: "ngc-" + LLM_BRANCH_TAG + "-x86_64",
+            args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
+            build_wheel: true,
+            dependentTarget: "devel",
+        ],
+        "Build NGC devel and release(SBSA)": [
+            target: "ngc-release",
+            action: release_action,
+            customTag: "ngc-" + LLM_BRANCH_TAG + "-sbsa",
+            args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
+            arch: "arm64",
+            build_wheel: true,
+            dependentTarget: "devel",
+        ],
+    ]
+    // Override all fields in build config with default values
+    buildConfigs.each { key, config ->
+        defaultBuildConfig.each { defaultKey, defaultValue ->
+            if (!(defaultKey in config)) {
+                config[defaultKey] = defaultValue
+            }
+        }
+        config.podConfig = createKubernetesPodConfig("build", config.arch, config.build_wheel)
+    }
+    echo "Build configs:"
+    println buildConfigs
+
+    def buildJobs = buildConfigs.collectEntries { key, config ->
+        [key, {
+            script {
+                stage(key) {
+                    config.stageName = key
+                    trtllm_utils.launchKubernetesPod(pipeline, config.podConfig, "docker") {
+                        buildImage(config, imageKeyToTag)
+                    }
+                }
+            }
+        }]
+    }
+
+    echo "enableFailFast is: ${env.enableFailFast}, but we currently don't use it due to random ucxx issue"
+    //pipeline.failFast = env.enableFailFast
+    pipeline.parallel buildJobs
+}
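Note: each buildConfigs entry lists only the fields it overrides; launchBuildJobs back-fills the rest from defaultBuildConfig before deriving a pod config. A standalone sketch of that back-fill in plain Groovy:

    // Every key missing from a config is copied from the defaults map. "k in config"
    // maps to Map.containsKey, so explicit overrides like build_wheel: false survive.
    def defaults = [target: "tritondevel", arch: "amd64", build_wheel: false]
    def config = [arch: "arm64"]  // only overrides the architecture
    defaults.each { k, v -> if (!(k in config)) { config[k] = v } }
    assert config == [arch: "arm64", target: "tritondevel", build_wheel: false]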
@@ -216,65 +417,40 @@ pipeline {
         timeout(time: 24, unit: 'HOURS')
     }
     environment {
+        CCACHE_DIR="${CCACHE_DIR}"
         PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
     }
     stages {
-        stage("Build")
-        {
-            parallel {
-                stage("Build trtllm release") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("trtllm", env.JOB_NAME ==~ /.*PostMerge.*/ ? "push" : params.action, "skip", "", LLM_BRANCH_TAG)
-                    }
-                }
-                stage("Build x86_64-skip") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("tritondevel", params.action, "skip")
-                    }
-                }
-                stage("Build trtllm release-sbsa") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build", "arm64")
-                    }
-                    steps
-                    {
-                        buildImage("trtllm", env.JOB_NAME ==~ /.*PostMerge.*/ ? "push" : params.action, "skip", "", LLM_BRANCH_TAG + "-sbsa", "", true)
-                    }
-                }
-                stage("Build rockylinux8 x86_64-skip-py3.10") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("rockylinux8", params.action, "skip", "PYTHON_VERSION=3.10.12 STAGE=tritondevel", "", "-py310")
-                    }
-                }
-                stage("Build rockylinux8 x86_64-skip-py3.12") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("rockylinux8", params.action, "skip", "PYTHON_VERSION=3.12.3 STAGE=tritondevel", "", "-py312")
-                    }
-                }
-                stage("Build SBSA-skip") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build", "arm64")
-                    }
-                    steps
-                    {
-                        buildImage("tritondevel", params.action, "skip", "", "", "", true)
-                    }
-                }
-            }
-        }
+        stage("Setup environment") {
+            steps {
+                script {
+                    echo "branch is: ${LLM_BRANCH}"
+                    echo "env.gitlabBranch is: ${env.gitlabBranch}"
+                    echo "params.branch is: ${params.branch}"
+                    echo "params.action is: ${params.action}"
+                    echo "env.defaultTag is: ${env.defaultTag}"
+                    echo "env.gitlabCommit is: ${env.gitlabCommit}"
+                    echo "LLM_REPO is: ${LLM_REPO}"
+                    echo "env.globalVars is: ${env.globalVars}"
+                    globalVars = trtllm_utils.updateMapWithJson(this, globalVars, env.globalVars, "globalVars")
+                    globalVars[ACTION_INFO] = trtllm_utils.setupPipelineDescription(this, globalVars[ACTION_INFO])
+                }
+            }
+        }
+        stage("Build") {
+            steps {
+                script {
+                    launchBuildJobs(this, globalVars, imageKeyToTag)
+                }
+            }
+        }
+        stage("Upload Artifacts") {
+            steps {
+                script {
+                    String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
+                    echo "imageKeyToTag is: ${imageKeyToTagJson}"
+                    writeFile file: "imageKeyToTag.json", text: imageKeyToTagJson
+                    archiveArtifacts artifacts: 'imageKeyToTag.json', fingerprint: true
+                }
+            }
+        }
     }
jenkins/L0_MergeRequest.groovy
@@ -1035,6 +1035,43 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
         }
     },
 ]
+def dockerBuildJob = [
+    "Build-Docker-Images": {
+        script {
+            stage("[Build-Docker-Images] Remote Run") {
+                def parameters = getCommonParameters()
+                String globalVarsJson = writeJSON returnText: true, json: globalVars
+                def branch = env.gitlabBranch ? env.gitlabBranch : "main"
+                if (globalVars[GITHUB_PR_API_URL]) {
+                    branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
+                }
+
+                parameters += [
+                    'enableFailFast': enableFailFast,
+                    'branch': branch,
+                    'action': "push",
+                    'globalVars': globalVarsJson,
+                ]
+
+                echo "trigger BuildDockerImages job, params: ${parameters}"
+
+                def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
+                if (status != "SUCCESS") {
+                    error "Downstream job did not succeed"
+                }
+            }
+        }
+    }
+]
+if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+    stages += dockerBuildJob
+}
+if (testFilter[(TEST_STAGE_LIST)]?.contains("Build-Docker-Images") || testFilter[(EXTRA_STAGE_LIST)]?.contains("Build-Docker-Images")) {
+    stages += dockerBuildJob
+    testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
+    testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
+    echo "Will run Build-Docker-Images job"
+}
+
 parallelJobs = stages.collectEntries{key, value -> [key, {
     script {
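Note: "stages += dockerBuildJob" relies on Groovy map addition, which merges the single "Build-Docker-Images" closure into the existing stage map before collectEntries turns everything into parallel branches. A minimal illustration (plain Groovy):

    // Map "+" merges entries (the right-hand map wins on key collisions).
    def stages = ["Test": { println "test" }]
    stages += ["Build-Docker-Images": { println "docker" }]
    assert stages.keySet() == ["Test", "Build-Docker-Images"] as Set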
jenkins/docker/Dockerfile.dind (new file)
@@ -0,0 +1,15 @@
+# docker buildx build -t tensorrt-llm:{timestamp}_docker_dind_withbash -f jenkins/docker/Dockerfile.dind . --builder=multi-builder --platform linux/arm64,linux/amd64
+
+FROM docker:dind
+
+RUN apk add --no-cache bash git make python3 py3-pip
+
+ENV PATH=/usr/local/cmake/bin:$PATH
+ENV ENV=${ENV:-/etc/shinit_v2}
+COPY docker/common/install_cmake.sh install_cmake.sh
+RUN bash ./install_cmake.sh && rm install_cmake.sh
+
+RUN git clone https://github.com/rapidsai/rapids-cmake.git /tmp/rapids-cmake && \
+    mkdir -p /usr/local/share/cmake/rapids && \
+    cp -r /tmp/rapids-cmake/rapids-cmake/* /usr/local/share/cmake/rapids/ && \
+    rm -rf /tmp/rapids-cmake