mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][fix] Several minor fixes to CI setting (#9765)
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
7c6c493993
commit
f59d64e6c7
@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
|
||||
sh "tar -zxf ${tarName}"
|
||||
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
|
||||
def llmSrc = "${llmPath}/TensorRT-LLM/src"
|
||||
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
|
||||
sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
|
||||
} catch (InterruptedException e) {
|
||||
throw e
|
||||
@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
|
||||
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
|
||||
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
|
||||
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
|
||||
// Perf sanity post merge test
|
||||
// Disable perf stages due to https://nvbugs/5643646
|
||||
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
|
||||
@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter)
|
||||
fullSet += SBSATestConfigs.keySet()
|
||||
|
||||
SBSASlurmTestConfigs = [
|
||||
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
|
||||
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
|
||||
// Disable GB300 stages because the nodes will be temporarily offline.
|
||||
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
|
||||
|
||||
@ -29,10 +29,14 @@ set_value_in_command() {
|
||||
echo "$result"
|
||||
}
|
||||
|
||||
# Only the first process will save the job ID
|
||||
# Only the first process will save the job ID and set the git config
|
||||
if [ $SLURM_PROCID -eq 0 ]; then
|
||||
# Save the job ID in $jobWorkspace/slurm_job_id.txt for a later job to retrieve
|
||||
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
|
||||
# Update the global git config: add the "*" safe.directory entry only if it is not already present
|
||||
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
|
||||
git config --global --add safe.directory "*"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $SLURM_LOCALID -eq 0 ]; then
|
||||
@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
|
||||
fi
|
||||
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
|
||||
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
|
||||
git config --global --add safe.directory "*"
|
||||
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
|
||||
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
|
||||
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
|
||||
|
||||
@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"]
|
||||
|
||||
|
||||
def install_python_dependencies(llm_src):
|
||||
subprocess.run(
|
||||
f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
|
||||
shell=True,
|
||||
check=True)
|
||||
subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
|
||||
shell=True,
|
||||
check=True)
|
||||
subprocess.run(
|
||||
f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
|
||||
shell=True,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user