Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

infra: [TRTLLM-5247][TRTLLM-5248][TRTLLM-5249] Refactor docker build image groovy and support NGC images (#4294)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>

Parent: 058f83e47b
Commit: 7b2b657198
@@ -117,7 +117,7 @@ RUN mkdir -p /root/.cache/pip /root/.cache/ccache
 ENV CCACHE_DIR=/root/.cache/ccache
 # Build the TRT-LLM wheel
 ARG BUILD_WHEEL_ARGS="--clean --python_bindings --benchmarks"
-RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \
+RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=${CCACHE_DIR} \
     python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}

 FROM ${DEVEL_IMAGE} AS release
@@ -161,24 +161,24 @@ release_run: WORK_DIR = /app/tensorrt_llm

 # For x86_64
 jenkins_%: IMAGE_WITH_TAG = $(shell grep 'LLM_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
-jenkins_%: STAGE = devel
+jenkins_%: STAGE = tritondevel

 # For aarch64
 jenkins-aarch64_%: IMAGE_WITH_TAG = $(shell grep 'LLM_SBSA_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
-jenkins-aarch64_%: STAGE = devel
+jenkins-aarch64_%: STAGE = tritondevel

 # For x86_64
 jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
-jenkins-rockylinux8_%: STAGE = devel
+jenkins-rockylinux8_%: STAGE = tritondevel
 jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
 jenkins-rockylinux8_%: BASE_TAG = 12.9.0-devel-rockylinux8

-rockylinux8_%: STAGE = devel
+rockylinux8_%: STAGE = tritondevel
 rockylinux8_%: BASE_IMAGE = nvidia/cuda
 rockylinux8_%: BASE_TAG = 12.9.0-devel-rockylinux8

 # For x86_64 and aarch64
-ubuntu22_%: STAGE = devel
+ubuntu22_%: STAGE = tritondevel
 ubuntu22_%: BASE_IMAGE = nvidia/cuda
 ubuntu22_%: BASE_TAG = 12.9.0-devel-ubuntu22.04
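The jenkins_% rules above pull IMAGE_WITH_TAG out of jenkins/L0_MergeRequest.groovy with grep, so those assignments must stay single-line quoted strings. A hypothetical sketch of what the matched lines look like (the real tags are generated per build; everything after each colon here is an assumption):

    // Hypothetical excerpt of jenkins/L0_MergeRequest.groovy; the Makefile's grep
    // keeps only the quoted value, e.g. the registry/image:tag reference.
    LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:x86_64-tritondevel-torch_skip-<tag>"
    LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:sbsa-tritondevel-torch_skip-<tag>"
    LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:x86_64-rockylinux8-torch_skip-py312-<tag>"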
@@ -12,17 +12,72 @@ withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LL
 }
 LLM_ROOT = "llm"

-LLM_BRANCH = env.gitlabBranch? env.gitlabBranch : params.branch
+LLM_BRANCH = env.gitlabBranch ?: params.branch
 LLM_BRANCH_TAG = LLM_BRANCH.replaceAll('/', '_')

-BUILD_JOBS = "32"
-BUILD_JOBS_RELEASE_X86_64 = "16"
-BUILD_JOBS_RELEASE_SBSA = "8"
+LLM_COMMIT_OR_BRANCH = env.gitlabCommit ?: LLM_BRANCH
+LLM_SHORT_COMMIT = env.gitlabCommit ? env.gitlabCommit.substring(0, 7) : "undefined"

-def createKubernetesPodConfig(type, arch = "amd64")
+LLM_DEFAULT_TAG = env.defaultTag ?: "${LLM_SHORT_COMMIT}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
+
+BUILD_JOBS = "32"
+BUILD_JOBS_RELEASE_X86_64 = "32"
+BUILD_JOBS_RELEASE_SBSA = "32"
+
+CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
+
+@Field
+def GITHUB_PR_API_URL = "github_pr_api_url"
+@Field
+def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
+@Field
+def ACTION_INFO = "action_info"
+def globalVars = [
+    (GITHUB_PR_API_URL): null,
+    (CACHED_CHANGED_FILE_LIST): null,
+    (ACTION_INFO): null,
+]
+
+@Field
+def imageKeyToTag = [:]
+
+def createKubernetesPodConfig(type, arch = "amd64", build_wheel = false)
 {
     def targetCould = "kubernetes-cpu"
     def containerConfig = ""
+    def selectors = """
+                nodeSelector:
+                  nvidia.com/node_type: builder
+                  kubernetes.io/os: linux
+                  kubernetes.io/arch: ${arch}"""
+
+    if (build_wheel && arch == "arm64") {
+        // For aarch64, we need to use hostname to fix the ucxx issue when building wheels
+        selectors += """
+                affinity:
+                  nodeAffinity:
+                    requiredDuringSchedulingIgnoredDuringExecution:
+                      nodeSelectorTerms:
+                      - matchExpressions:
+                        - key: "kubernetes.io/hostname"
+                          operator: In
+                          values:
+                          - "rl300-0008.ipp2u1.colossus"
+                          - "rl300-0014.ipp2u1.colossus"
+                          - "rl300-0023.ipp2u1.colossus"
+                          - "rl300-0024.ipp2u1.colossus"
+                          - "rl300-0030.ipp2u1.colossus"
+                          - "rl300-0040.ipp2u1.colossus"
+                          - "rl300-0041.ipp2u1.colossus"
+                          - "rl300-0042.ipp2u1.colossus"
+                          - "rl300-0043.ipp2u1.colossus"
+                          - "rl300-0044.ipp2u1.colossus"
+                          - "rl300-0045.ipp2u1.colossus"
+                          - "rl300-0046.ipp2u1.colossus"
+                          - "rl300-0047.ipp2u1.colossus"
+        """
+    }

     switch(type)
     {
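The first change in this hunk replaces a self-referencing ternary with Groovy's Elvis operator, which is behavior-preserving; a minimal sketch of the semantics:

    // a ?: b yields a when a is "truthy" (non-null, non-empty), otherwise b.
    def gitlabBranch = null                          // e.g. not set for manual runs
    assert (gitlabBranch ?: "main") == "main"
    gitlabBranch = "feature/x"
    assert (gitlabBranch ?: "main") == "feature/x"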
@@ -44,9 +99,10 @@ def createKubernetesPodConfig(type, arch = "amd64")
                     imagePullPolicy: Always"""
         break
     case "build":
+        // Use a customized docker:dind image with essential dependencies
         containerConfig = """
                   - name: docker
-                    image: urm.nvidia.com/docker/docker:dind
+                    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:202505221445_docker_dind_withbash
                     tty: true
                     resources:
                       requests:
@@ -65,19 +121,34 @@ def createKubernetesPodConfig(type, arch = "amd64")
                         - SYS_ADMIN"""
         break
     }

+    def pvcVolume = """
+                - name: sw-tensorrt-pvc
+                  persistentVolumeClaim:
+                    claimName: sw-tensorrt-pvc
+    """
+    if (arch == "arm64") {
+        // PVC mount isn't supported on aarch64 platform. Use NFS as a WAR.
+        pvcVolume = """
+                - name: sw-tensorrt-pvc
+                  nfs:
+                    server: 10.117.145.13
+                    path: /vol/scratch1/scratch.svc_tensorrt_blossom
+    """
+    }
     def nodeLabelPrefix = "cpu"
     def jobName = "llm-build-images"
     def buildID = env.BUILD_ID
     def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}")
     def podConfig = [
         cloud: targetCould,
         namespace: "sw-tensorrt",
         label: nodeLabel,
         yaml: """
             apiVersion: v1
             kind: Pod
             spec:
                 qosClass: Guaranteed
-                nodeSelector:
-                  nvidia.com/node_type: builder
-                  kubernetes.io/os: linux
-                  kubernetes.io/arch: ${arch}
+                ${selectors}
                 containers:
                   ${containerConfig}
                   - name: jnlp
@@ -92,6 +163,12 @@ def createKubernetesPodConfig(type, arch = "amd64")
                         cpu: '2'
                         memory: 10Gi
                         ephemeral-storage: 25Gi
+                    volumeMounts:
+                    - name: sw-tensorrt-pvc
+                      mountPath: "/mnt/sw-tensorrt-pvc"
+                      readOnly: false
+                volumes:
+                ${pvcVolume}
         """.stripIndent(),
     ]
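The net effect of the new build_wheel flag and the arm64 volume fallback is confined to the generated pod YAML. A hedged usage sketch (launchKubernetesPod is the repo's existing helper; the closure body here is illustrative only):

    // An aarch64 wheel-building pod: gets the hostname-pinned nodeAffinity
    // and the NFS-backed sw-tensorrt-pvc volume instead of the PVC mount.
    def podConfig = createKubernetesPodConfig("build", "arm64", true)
    trtllm_utils.launchKubernetesPod(this, podConfig, "docker") {
        sh "docker info"   // illustrative step inside the dind container
    }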
@@ -99,94 +176,218 @@ def createKubernetesPodConfig(type, arch = "amd64")
 }


-def buildImage(target, action="build", torchInstallType="skip", args="", custom_tag="", post_tag="", is_sbsa=false)
+def buildImage(config, imageKeyToTag)
 {
-    def arch = is_sbsa ? "sbsa" : "x86_64"
-    def tag = "${arch}-${target}-torch_${torchInstallType}${post_tag}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
+    def target = config.target
+    def action = config.action
+    def torchInstallType = config.torchInstallType
+    def args = config.args ?: ""
+    def customTag = config.customTag
+    def postTag = config.postTag
+    def dependentTarget = config.dependentTarget
+    def arch = config.arch == 'arm64' ? 'sbsa' : 'x86_64'

-    // Step 1: cloning tekit source code
-    // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
-    trtllm_utils.checkoutSource(LLM_REPO, LLM_BRANCH, LLM_ROOT, true, true)
+    def tag = "${arch}-${target}-torch_${torchInstallType}${postTag}-${LLM_DEFAULT_TAG}"

-    // Step 2: building wheels in container
     container("docker") {
-        stage ("Install packages") {
-            sh "pwd && ls -alh"
-            sh "env"
-            sh "apk add make git"
-            sh "git config --global --add safe.directory '*'"
+        def dependentTargetTag = tag.replace("${arch}-${target}-", "${arch}-${dependentTarget}-")

-            withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-                sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
-            }
+        if (target == "ngc-release") {
+            imageKeyToTag["NGC Devel Image ${config.arch}"] = "${IMAGE_NAME}/${dependentTarget}:${dependentTargetTag}"
+            imageKeyToTag["NGC Release Image ${config.arch}"] = "${IMAGE_NAME}/${target}:${tag}"
+        }

-            withCredentials([
-                usernamePassword(
-                    credentialsId: "svc_tensorrt_gitlab_read_api_token",
-                    usernameVariable: 'USERNAME',
-                    passwordVariable: 'PASSWORD'
-                ),
-                string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
-            ]) {
-                sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
-            }
+        args += " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
+
+        stage (config.stageName) {
+            // Step 1: Clone TRT-LLM source codes
+            // If using a forked repo, svc_tensorrt needs to have the access to the forked repo.
+            trtllm_utils.checkoutSource(LLM_REPO, LLM_COMMIT_OR_BRANCH, LLM_ROOT, true, true)
+        }
+
+        // Step 2: Build the images
+        stage ("Install packages") {
+            sh "pwd && ls -alh"
+            sh "env"
+            sh "apk add make git"
+            sh "git config --global --add safe.directory '*'"
+
+            withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+                sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
+            }
+
+            withCredentials([
+                usernamePassword(
+                    credentialsId: "svc_tensorrt_gitlab_read_api_token",
+                    usernameVariable: 'USERNAME',
+                    passwordVariable: 'PASSWORD'
+                ),
+                string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
+            ]) {
+                sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
+            }
         }
-        try {
-            // Fix the build OOM issue of release builds
-            def build_jobs = BUILD_JOBS
-            if (target == "trtllm") {
-                if (arch == "x86_64") {
-                    build_jobs = BUILD_JOBS_RELEASE_X86_64
-                } else {
-                    build_jobs = BUILD_JOBS_RELEASE_SBSA
-                }
-            }
-            containerGenFailure = null
-            stage ("make ${target}_${action}") {
-                retry(3)
-                {
-                    // Fix the triton image pull timeout issue
-                    def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
-                    def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
-                    retry(3) {
-                        sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
-                    }
-
-                    sh """
-                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
-                    TORCH_INSTALL_TYPE=${torchInstallType} \
-                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
-                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
-                    GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
-                    """
-                }
-            }
-
-            if (custom_tag) {
-                stage ("custom tag: ${custom_tag}") {
-                    sh """
-                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
-                    TORCH_INSTALL_TYPE=${torchInstallType} \
-                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${custom_tag} \
-                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} \
-                    GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote
-                    """
-                }
-            }
-        } catch (Exception ex) {
-            containerGenFailure = ex
-        } finally {
-            stage ("Docker logout") {
-                withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
-                    sh "docker logout urm.nvidia.com"
-                    sh "docker logout ${DEFAULT_GIT_URL}:5005"
-                }
-            }
-            if (containerGenFailure != null) {
-                throw containerGenFailure
-            }
-        }
+        try {
+            def build_jobs = BUILD_JOBS
+            // Fix the triton image pull timeout issue
+            def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
+            def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
+            containerGenFailure = null
+
+            if (dependentTarget) {
+                stage ("make ${dependentTarget}_${action} (${arch})") {
+                    retry(3) {
+                        retry(3) {
+                            sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
+                        }
+                        sh """
+                        cd ${LLM_ROOT} && make -C docker ${dependentTarget}_${action} \
+                        TORCH_INSTALL_TYPE=${torchInstallType} \
+                        IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${dependentTargetTag} \
+                        BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
+                        """
+                    }
+                    args += " DEVEL_IMAGE=${IMAGE_NAME}/${dependentTarget}:${dependentTargetTag}"
+                }
+            }
+
+            // Avoid the frequency of OOM issue when building the wheel
+            if (target == "trtllm") {
+                if (arch == "x86_64") {
+                    build_jobs = BUILD_JOBS_RELEASE_X86_64
+                } else {
+                    build_jobs = BUILD_JOBS_RELEASE_SBSA
+                }
+            }
+            stage ("make ${target}_${action} (${arch})") {
+                retry(3) {
+                    retry(3) {
+                        sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}"
+                    }
+
+                    sh """
+                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
+                    TORCH_INSTALL_TYPE=${torchInstallType} \
+                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${tag} \
+                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
+                    """
+                }
+            }
+
+            if (customTag) {
+                stage ("custom tag: ${customTag} (${arch})") {
+                    sh """
+                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
+                    TORCH_INSTALL_TYPE=${torchInstallType} \
+                    IMAGE_NAME=${IMAGE_NAME} IMAGE_TAG=${customTag} \
+                    BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
+                    """
+                }
+            }
+        } catch (Exception ex) {
+            containerGenFailure = ex
+        } finally {
+            stage ("Docker logout") {
+                withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
+                    sh "docker logout urm.nvidia.com"
+                    sh "docker logout ${DEFAULT_GIT_URL}:5005"
+                }
+            }
+            if (containerGenFailure != null) {
+                throw containerGenFailure
+            }
+        }
     }
 }
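buildImage now receives one config map plus the shared imageKeyToTag map instead of seven positional arguments; unset keys are later filled from defaultBuildConfig in launchBuildJobs below. A minimal sketch of that fill-missing-keys idiom (on a map, Groovy's in operator tests containsKey):

    def defaults = [target: "tritondevel", arch: "amd64", build_wheel: false]
    def config = [target: "trtllm", arch: "arm64"]
    defaults.each { k, v ->
        if (!(k in config)) {     // only fill keys the caller left out
            config[k] = v
        }
    }
    assert config == [target: "trtllm", arch: "arm64", build_wheel: false]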
+def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
+    def defaultBuildConfig = [
+        target: "tritondevel",
+        action: params.action,
+        customTag: "",
+        postTag: "",
+        args: "",
+        torchInstallType: "skip",
+        arch: "amd64",
+        build_wheel: false,
+        dependentTarget: "",
+    ]
+
+    def release_action = env.JOB_NAME ==~ /.*PostMerge.*/ ? "push" : params.action
+    def buildConfigs = [
+        "Build trtllm release (x86_64)": [
+            target: "trtllm",
+            action: release_action,
+            customTag: LLM_BRANCH_TAG + "-x86_64",
+            build_wheel: true,
+        ],
+        "Build trtllm release (SBSA)": [
+            target: "trtllm",
+            action: release_action,
+            customTag: LLM_BRANCH_TAG + "-sbsa",
+            build_wheel: true,
+            arch: "arm64"
+        ],
+        "Build CI image (x86_64 tritondevel)": [:],
+        "Build CI image (SBSA tritondevel)": [
+            arch: "arm64",
+        ],
+        "Build CI image (RockyLinux8 Python310)": [
+            target: "rockylinux8",
+            args: "PYTHON_VERSION=3.10.12",
+            postTag: "-py310",
+        ],
+        "Build CI image(RockyLinux8 Python312)": [
+            target: "rockylinux8",
+            args: "PYTHON_VERSION=3.12.3 STAGE=tritondevel",
+            postTag: "-py312",
+        ],
+        "Build NGC devel and release (x86_64)": [
+            target: "ngc-release",
+            action: release_action,
+            customTag: "ngc-" + LLM_BRANCH_TAG + "-x86_64",
+            args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
+            build_wheel: true,
+            dependentTarget: "devel",
+        ],
+        "Build NGC devel and release(SBSA)": [
+            target: "ngc-release",
+            action: release_action,
+            customTag: "ngc-" + LLM_BRANCH_TAG + "-sbsa",
+            args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
+            arch: "arm64",
+            build_wheel: true,
+            dependentTarget: "devel",
+        ],
+    ]
+    // Override all fields in build config with default values
+    buildConfigs.each { key, config ->
+        defaultBuildConfig.each { defaultKey, defaultValue ->
+            if (!(defaultKey in config)) {
+                config[defaultKey] = defaultValue
+            }
+        }
+        config.podConfig = createKubernetesPodConfig("build", config.arch, config.build_wheel)
+    }
+    echo "Build configs:"
+    println buildConfigs
+
+    def buildJobs = buildConfigs.collectEntries { key, config ->
+        [key, {
+            script {
+                stage(key) {
+                    config.stageName = key
+                    trtllm_utils.launchKubernetesPod(pipeline, config.podConfig, "docker") {
+                        buildImage(config, imageKeyToTag)
+                    }
+                }
+            }
+        }]
+    }
+
+    echo "enableFailFast is: ${env.enableFailFast}, but we currently don't use it due to random ucxx issue"
+    //pipeline.failFast = env.enableFailFast
+    pipeline.parallel buildJobs
+}
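launchBuildJobs turns every entry of buildConfigs into a named closure via collectEntries and hands the resulting map to pipeline.parallel. A minimal sketch of the pattern:

    // Each map entry becomes [stage name, closure]; parallel runs them concurrently.
    def jobs = ["x86_64", "SBSA"].collectEntries { name ->
        [name, { echo "building ${name}" }]
    }
    parallel jobs    // in the refactor this is pipeline.parallel buildJobs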
@@ -216,65 +417,40 @@ pipeline {
         timeout(time: 24, unit: 'HOURS')
     }
     environment {
         CCACHE_DIR="${CCACHE_DIR}"
         PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
     }
     stages {
-        stage("Build")
-        {
-            parallel {
-                stage("Build trtllm release") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("trtllm", env.JOB_NAME ==~ /.*PostMerge.*/ ? "push" : params.action, "skip", "", LLM_BRANCH_TAG)
-                    }
-                }
-                stage("Build x86_64-skip") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("tritondevel", params.action, "skip")
-                    }
-                }
-                stage("Build trtllm release-sbsa") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build", "arm64")
-                    }
-                    steps
-                    {
-                        buildImage("trtllm", env.JOB_NAME ==~ /.*PostMerge.*/ ? "push" : params.action, "skip", "", LLM_BRANCH_TAG + "-sbsa", "", true)
-                    }
-                }
-                stage("Build rockylinux8 x86_64-skip-py3.10") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("rockylinux8", params.action, "skip", "PYTHON_VERSION=3.10.12 STAGE=tritondevel", "", "-py310")
-                    }
-                }
-                stage("Build rockylinux8 x86_64-skip-py3.12") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build")
-                    }
-                    steps
-                    {
-                        buildImage("rockylinux8", params.action, "skip", "PYTHON_VERSION=3.12.3 STAGE=tritondevel", "", "-py312")
-                    }
-                }
-                stage("Build SBSA-skip") {
-                    agent {
-                        kubernetes createKubernetesPodConfig("build", "arm64")
-                    }
-                    steps
-                    {
-                        buildImage("tritondevel", params.action, "skip", "", "", "", true)
-                    }
-                }
-            }
-        }
+        stage("Setup environment") {
+            steps {
+                script {
+                    echo "branch is: ${LLM_BRANCH}"
+                    echo "env.gitlabBranch is: ${env.gitlabBranch}"
+                    echo "params.branch is: ${params.branch}"
+                    echo "params.action is: ${params.action}"
+                    echo "env.defaultTag is: ${env.defaultTag}"
+                    echo "env.gitlabCommit is: ${env.gitlabCommit}"
+                    echo "LLM_REPO is: ${LLM_REPO}"
+                    echo "env.globalVars is: ${env.globalVars}"
+                    globalVars = trtllm_utils.updateMapWithJson(this, globalVars, env.globalVars, "globalVars")
+                    globalVars[ACTION_INFO] = trtllm_utils.setupPipelineDescription(this, globalVars[ACTION_INFO])
+                }
+            }
+        }
+        stage("Build") {
+            steps{
+                script{
+                    launchBuildJobs(this, globalVars, imageKeyToTag)
+                }
+            }
+        }
+        stage("Upload Artifacts") {
+            steps {
+                script {
+                    String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
+                    echo "imageKeyToTag is: ${imageKeyToTagJson}"
+                    writeFile file: "imageKeyToTag.json", text: imageKeyToTagJson
+                    archiveArtifacts artifacts: 'imageKeyToTag.json', fingerprint: true
+                }
+            }
+        }
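The Upload Artifacts stage persists imageKeyToTag.json so downstream automation can resolve image keys to concrete tags. A hypothetical consumer sketch (copyArtifacts comes from the Copy Artifact plugin and the project name is an assumption; the key format matches buildImage above):

    copyArtifacts projectName: '/LLM/helpers/BuildDockerImages', filter: 'imageKeyToTag.json'
    def imageKeyToTag = readJSON file: 'imageKeyToTag.json'
    // Keys follow "NGC Release Image ${config.arch}" from buildImage above.
    echo "NGC release image (x86_64): ${imageKeyToTag['NGC Release Image amd64']}"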
@@ -1035,6 +1035,43 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
             }
         },
     ]
+    def dockerBuildJob = [
+        "Build-Docker-Images": {
+            script {
+                stage("[Build-Docker-Images] Remote Run") {
+                    def parameters = getCommonParameters()
+                    String globalVarsJson = writeJSON returnText: true, json: globalVars
+                    def branch = env.gitlabBranch ? env.gitlabBranch : "main"
+                    if (globalVars[GITHUB_PR_API_URL]) {
+                        branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
+                    }
+
+                    parameters += [
+                        'enableFailFast': enableFailFast,
+                        'branch': branch,
+                        'action': "push",
+                        'globalVars': globalVarsJson,
+                    ]
+
+                    echo "trigger BuildDockerImages job, params: ${parameters}"
+
+                    def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
+                    if (status != "SUCCESS") {
+                        error "Downstream job did not succeed"
+                    }
+                }
+            }
+        }
+    ]
+    if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+        stages += dockerBuildJob
+    }
+    if (testFilter[(TEST_STAGE_LIST)]?.contains("Build-Docker-Images") || testFilter[(EXTRA_STAGE_LIST)]?.contains("Build-Docker-Images")) {
+        stages += dockerBuildJob
+        testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
+        testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
+        echo "Will run Build-Docker-Images job"
+    }

     parallelJobs = stages.collectEntries{key, value -> [key, {
         script {
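Appending dockerBuildJob under either trigger condition relies on Groovy map addition, which merges entries into a new map; a minimal sketch:

    def stages = ["Build": { echo "build" }]
    def dockerBuildJob = ["Build-Docker-Images": { echo "docker images" }]
    stages += dockerBuildJob    // map plus map yields a merged map
    assert stages.keySet() == ["Build", "Build-Docker-Images"] as Set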
jenkins/docker/Dockerfile.dind (new file, 15 lines)

@@ -0,0 +1,15 @@
+# docker buildx build -t tensorrt-llm:{timestamp}_docker_dind_withbash -f jenkins/docker/Dockerfile.dind . --builder=multi-builder --platform linux/arm64,linux/amd64
+
+FROM docker:dind
+
+RUN apk add --no-cache bash git make python3 py3-pip
+
+ENV PATH=/usr/local/cmake/bin:$PATH
+ENV ENV=${ENV:-/etc/shinit_v2}
+COPY docker/common/install_cmake.sh install_cmake.sh
+RUN bash ./install_cmake.sh && rm install_cmake.sh
+
+RUN git clone https://github.com/rapidsai/rapids-cmake.git /tmp/rapids-cmake && \
+    mkdir -p /usr/local/share/cmake/rapids && \
+    cp -r /tmp/rapids-cmake/rapids-cmake/* /usr/local/share/cmake/rapids/ && \
+    rm -rf /tmp/rapids-cmake