[None][fix] Several minor fixes to CI settings (#9765)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-12-07 23:07:59 +08:00 committed by GitHub
parent 7c6c493993
commit f59d64e6c7
GPG Key ID: B5690EEEBB952194
3 changed files with 15 additions and 9 deletions

@@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
sh "tar -zxf ${tarName}"
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
def llmSrc = "${llmPath}/TensorRT-LLM/src"
+trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
} catch (InterruptedException e) {
throw e
@@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// Disable perf stages due to https://nvbugs/5643646
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
@@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter)
fullSet += SBSATestConfigs.keySet()
SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
// Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],

@@ -29,10 +29,14 @@ set_value_in_command() {
echo "$result"
}
-# Only the first process will save the job ID
+# Only the first process will save the job ID and set the git config
if [ $SLURM_PROCID -eq 0 ]; then
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
+# Update HOME/.gitconfig
+if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
+git config --global --add safe.directory "*"
+fi
fi
if [ $SLURM_LOCALID -eq 0 ]; then
@@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
fi
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
-git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

@@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"]
def install_python_dependencies(llm_src):
-subprocess.run(
-f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
-shell=True,
-check=True)
+subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
+shell=True,
+check=True)
subprocess.run(
f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
shell=True,
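
Here the script's own pip invocation drops --retries 1, so pip's default retry count applies, while the Jenkins-side install added in launchTestListCheck is wrapped in trtllm_utils.llmExecStepWithRetry, presumably moving retries to the pipeline level. A rough shell-level comparison (pip's default of 5 retries per connection is my understanding of its CLI, not stated in the diff):

    # Old form: at most one retry per connection, so a transient index error
    # aborts the whole check (subprocess.run uses check=True).
    pip3 install --retries 1 -r requirements-dev.txt
    # New form: pip's default retry count applies.
    pip3 install -r requirements-dev.txt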