[Infra] - Always use x86 image for the Jenkins agent and a few clean-ups (#5753)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
2026-01-13 22:18:36 +08:00 · 2025-07-06 10:25:57 +08:00 · 2025-07-06 10:25:57 +08:00 · d95ae1378b
commit d95ae1378b
parent 6bddaf6df6
4 changed files with 18 additions and 45 deletions
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@ -16,7 +16,8 @@ AARCH64_TRIPLE = "aarch64-linux-gnu"

 LLM_DOCKER_IMAGE = env.dockerImage

-AGENT_IMAGE = env.dockerImage
+// Always use x86_64 image for agent
+AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")

 POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
 POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@ -44,12 +44,6 @@ def getContainerURIs()
    return uris
 }

-// TODO: Move common variables to an unified location
-BUILD_CORES_REQUEST = "8"
-BUILD_CORES_LIMIT = "8"
-BUILD_MEMORY_REQUEST = "48Gi"
-BUILD_MEMORY_LIMIT = "48Gi"
-
 // Stage choices
 STAGE_CHOICE_NORMAL = "normal"
 STAGE_CHOICE_SKIP = "skip"
@ -214,37 +208,15 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
                    resources:
                      requests:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
-    case "build":
-        containerConfig = """
-                  - name: trt-llm
-                    image: ${image}
-                    command: ['cat']
-                    volumeMounts:
-                    - name: sw-tensorrt-pvc
-                      mountPath: "/mnt/sw-tensorrt-pvc"
-                      readOnly: false
-                    tty: true
-                    resources:
-                      requests:
-                        cpu: ${BUILD_CORES_REQUEST}
-                        memory: ${BUILD_MEMORY_REQUEST}
-                        ephemeral-storage: 200Gi
-                      limits:
-                        cpu: ${BUILD_CORES_LIMIT}
-                        memory: ${BUILD_MEMORY_LIMIT}
-                        ephemeral-storage: 200Gi
-                    imagePullPolicy: Always"""
-        nodeLabelPrefix = "cpu"
-        break
    case "package":
        containerConfig = """
                  - name: trt-llm
@ -254,11 +226,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
                    resources:
                      requests:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
@ -299,11 +271,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
                    resources:
                      requests:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                qosClass: Guaranteed
                volumes:
@ -327,7 +299,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
 def setupPipelineEnvironment(pipeline, testFilter, globalVars)
 {
    image = "urm.nvidia.com/docker/golang:1.22"
-    setupPipelineSpec = createKubernetesPodConfig(image, "build")
+    setupPipelineSpec = createKubernetesPodConfig(image, "package")
    trtllm_utils.launchKubernetesPod(pipeline, setupPipelineSpec, "trt-llm", {
        sh "env | sort"
        updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: 'running'
@ -413,7 +385,7 @@ def launchReleaseCheck(pipeline)

    def image = "urm.nvidia.com/docker/golang:1.22"
    stageName = "Release Check"
-    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "build"), "trt-llm", {
+    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
        stage("[${stageName}] Run") {
            if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
                echo "Release Check job is skipped due to Jenkins configuration"
--- a/jenkins/controlCCache.groovy
+++ b/jenkins/controlCCache.groovy
@ -34,11 +34,11 @@ def createKubernetesPodConfig(image, arch = "amd64")
                    resources:
                      requests:
                        cpu: 2
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: 2
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always
                  - name: jnlp
@ -47,11 +47,11 @@ def createKubernetesPodConfig(image, arch = "amd64")
                    resources:
                      requests:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
-                        memory: 10Gi
+                        memory: 5Gi
                        ephemeral-storage: 25Gi
                qosClass: Guaranteed
                volumes:
@ -119,10 +119,10 @@ pipeline {
                              case "Reset":
                                sh "rm -rf ${CCACHE_DIR}"
                                sh "mkdir -p ${CCACHE_DIR}"
-                                sh "printf \"max_size=300G\ntemporary_dir=/tmp/ccache\ncompression = true\n\" > ${CCACHE_DIR}/ccache.conf"
+                                sh "printf 'max_size=500G\ntemporary_dir=/tmp/ccache\ncompression=true\nbase_dir=/home/jenkins/agent/workspace/LLM\nsloppiness=file_macro,time_macros,pch_defines\n' > ${CCACHE_DIR}/ccache.conf"
                                break
                              case "Config":
-                                sh "printf \"max_size=300G\ntemporary_dir=/tmp/ccache\ncompression = true\n\" > ${CCACHE_DIR}/ccache.conf"
+                                sh "printf 'max_size=500G\ntemporary_dir=/tmp/ccache\ncompression=true\nbase_dir=/home/jenkins/agent/workspace/LLM\nsloppiness=file_macro,time_macros,pch_defines\n' > ${CCACHE_DIR}/ccache.conf"
                                break
                              case "Stats":
                                sh "ccache -sv"
--- a/requirements.txt
+++ b/requirements.txt
@ -13,9 +13,9 @@ onnx_graphsurgeon>=0.5.2
 openai
 polygraphy
 psutil
-nvidia-ml-py>=12
+nvidia-ml-py>=12,<13
 # Just a wrapper since nvidia-modelopt requires pynvml
-pynvml>=12.0.0
+pynvml==12.0.0
 pulp
 pandas
 h5py==3.12.1