[None][fix] Several minor fixes to CI settings (#9765)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-12-07 23:07:59 +08:00 committed by GitHub
parent 7c6c493993
commit f59d64e6c7
GPG Key ID: B5690EEEBB952194
3 changed files with 15 additions and 9 deletions

@@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
sh "tar -zxf ${tarName}"
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
def llmSrc = "${llmPath}/TensorRT-LLM/src"
+trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
} catch (InterruptedException e) {
throw e
@@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// Disable perf stages due to https://nvbugs/5643646
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
@@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter)
fullSet += SBSATestConfigs.keySet()
SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
// Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],

@@ -29,10 +29,14 @@ set_value_in_command() {
echo "$result"
}
-# Only the first process will save the job ID
+# Only the first process will save the job ID and set the git config
if [ $SLURM_PROCID -eq 0 ]; then
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
+# Update HOME/.gitconfig
+if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
+git config --global --add safe.directory "*"
+fi
fi
if [ $SLURM_LOCALID -eq 0 ]; then
@@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
fi
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
-git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

@@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"]
def install_python_dependencies(llm_src):
-subprocess.run(
-f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
-shell=True,
-check=True)
+subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
+shell=True,
+check=True)
subprocess.run(
f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
shell=True,
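
Here the script's own pip invocation drops --retries 1, so pip's default retry count applies, while the Jenkins-side install added in launchTestListCheck is wrapped in trtllm_utils.llmExecStepWithRetry, presumably moving retries to the pipeline level. A rough shell-level comparison (pip's default of 5 retries per connection is my understanding of its CLI, not stated in the diff):

    # Old form: at most one retry per connection, so a transient index error
    # aborts the whole check (subprocess.run uses check=True).
    pip3 install --retries 1 -r requirements-dev.txt
    # New form: pip's default retry count applies.
    pip3 install -r requirements-dev.txt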