[Infra] - Always use x86 image for the Jenkins agent and a few clean-ups (#5753)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
Yanchao Lu 2025-07-06 10:25:57 +08:00 committed by GitHub
parent 6bddaf6df6
commit d95ae1378b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 18 additions and 45 deletions

View File

@ -16,7 +16,8 @@ AARCH64_TRIPLE = "aarch64-linux-gnu"
LLM_DOCKER_IMAGE = env.dockerImage
AGENT_IMAGE = env.dockerImage
// Always use x86_64 image for agent
AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")
POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"

View File

@ -44,12 +44,6 @@ def getContainerURIs()
return uris
}
// TODO: Move common variables to a unified location
BUILD_CORES_REQUEST = "8"
BUILD_CORES_LIMIT = "8"
BUILD_MEMORY_REQUEST = "48Gi"
BUILD_MEMORY_LIMIT = "48Gi"
// Stage choices
STAGE_CHOICE_NORMAL = "normal"
STAGE_CHOICE_SKIP = "skip"
@ -214,37 +208,15 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
resources:
requests:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
imagePullPolicy: Always"""
nodeLabelPrefix = "cpu"
break
case "build":
containerConfig = """
- name: trt-llm
image: ${image}
command: ['cat']
volumeMounts:
- name: sw-tensorrt-pvc
mountPath: "/mnt/sw-tensorrt-pvc"
readOnly: false
tty: true
resources:
requests:
cpu: ${BUILD_CORES_REQUEST}
memory: ${BUILD_MEMORY_REQUEST}
ephemeral-storage: 200Gi
limits:
cpu: ${BUILD_CORES_LIMIT}
memory: ${BUILD_MEMORY_LIMIT}
ephemeral-storage: 200Gi
imagePullPolicy: Always"""
nodeLabelPrefix = "cpu"
break
case "package":
containerConfig = """
- name: trt-llm
@ -254,11 +226,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
resources:
requests:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
imagePullPolicy: Always"""
nodeLabelPrefix = "cpu"
@ -299,11 +271,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
resources:
requests:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
qosClass: Guaranteed
volumes:
@ -327,7 +299,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
def setupPipelineEnvironment(pipeline, testFilter, globalVars)
{
image = "urm.nvidia.com/docker/golang:1.22"
setupPipelineSpec = createKubernetesPodConfig(image, "build")
setupPipelineSpec = createKubernetesPodConfig(image, "package")
trtllm_utils.launchKubernetesPod(pipeline, setupPipelineSpec, "trt-llm", {
sh "env | sort"
updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: 'running'
@ -413,7 +385,7 @@ def launchReleaseCheck(pipeline)
def image = "urm.nvidia.com/docker/golang:1.22"
stageName = "Release Check"
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "build"), "trt-llm", {
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
stage("[${stageName}] Run") {
if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
echo "Release Check job is skipped due to Jenkins configuration"

View File

@ -34,11 +34,11 @@ def createKubernetesPodConfig(image, arch = "amd64")
resources:
requests:
cpu: 2
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: 2
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
imagePullPolicy: Always
- name: jnlp
@ -47,11 +47,11 @@ def createKubernetesPodConfig(image, arch = "amd64")
resources:
requests:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
limits:
cpu: '2'
memory: 10Gi
memory: 5Gi
ephemeral-storage: 25Gi
qosClass: Guaranteed
volumes:
@ -119,10 +119,10 @@ pipeline {
case "Reset":
sh "rm -rf ${CCACHE_DIR}"
sh "mkdir -p ${CCACHE_DIR}"
sh "printf \"max_size=300G\ntemporary_dir=/tmp/ccache\ncompression = true\n\" > ${CCACHE_DIR}/ccache.conf"
sh "printf 'max_size=500G\ntemporary_dir=/tmp/ccache\ncompression=true\nbase_dir=/home/jenkins/agent/workspace/LLM\nsloppiness=file_macro,time_macros,pch_defines\n' > ${CCACHE_DIR}/ccache.conf"
break
case "Config":
sh "printf \"max_size=300G\ntemporary_dir=/tmp/ccache\ncompression = true\n\" > ${CCACHE_DIR}/ccache.conf"
sh "printf 'max_size=500G\ntemporary_dir=/tmp/ccache\ncompression=true\nbase_dir=/home/jenkins/agent/workspace/LLM\nsloppiness=file_macro,time_macros,pch_defines\n' > ${CCACHE_DIR}/ccache.conf"
break
case "Stats":
sh "ccache -sv"

View File

@ -13,9 +13,9 @@ onnx_graphsurgeon>=0.5.2
openai
polygraphy
psutil
nvidia-ml-py>=12
nvidia-ml-py>=12,<13
# Just a wrapper since nvidia-modelopt requires pynvml
pynvml>=12.0.0
pynvml==12.0.0
pulp
pandas
h5py==3.12.1