[None][ci] Some improvements for Slurm CI (#7689)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Authored by Yanchao Lu on 2025-09-14 16:56:32 +08:00; committed by GitHub
parent 1f43854496
commit 89fc136972
4 changed files with 201 additions and 38 deletions

View File

@@ -283,7 +283,7 @@ def buildImage(config, imageKeyToTag)
sh "git config --global --add safe.directory '*'"
withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
}
withCredentials([
@@ -294,7 +294,7 @@ def buildImage(config, imageKeyToTag)
),
string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
]) {
sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
}
}
def containerGenFailure = null
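Both registry logins in buildImage are now wrapped in trtllm_utils.llmExecStepWithRetry, the pipeline's retry helper, so a transient failure of docker login against urm.nvidia.com or ${DEFAULT_GIT_URL}:5005 no longer fails the build outright. The helper's implementation is not part of this diff; a rough, hypothetical shell sketch of the same retry idea (the attempt count, delay, and the REGISTRY/USERNAME/PASSWORD placeholders are illustrative, not the helper's actual behavior):

# Hypothetical retry loop around docker login; all values are placeholders.
max_attempts=3
attempt=1
until docker login "$REGISTRY" -u "$USERNAME" -p "$PASSWORD"; do
    if [ "$attempt" -ge "$max_attempts" ]; then
        echo "docker login failed after $max_attempts attempts" >&2
        exit 1
    fi
    sleep $((attempt * 10))   # simple linear backoff between attempts
    attempt=$((attempt + 1))
done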

View File

@@ -105,24 +105,28 @@ REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=5"
COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
def downloadSucceed = false
pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
@@ -136,8 +140,9 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
println("No results xml to submit")
}
}
if (downloadSucceed) {
junit(testResults: "${stageName}/results*.xml")
junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
}
}
}
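Each SSH/scp step in this file now connects through SlurmConfig.getRandomLoginNode(cluster.host) rather than the fixed cluster.ip, spreading CI traffic across the cluster's login nodes. The helper itself is not shown in this diff; presumably it maps the cluster host entry to its list of login nodes and picks one at random. A rough shell analog of that idea, with a hypothetical comma-separated LOGIN_NODES list:

# Hypothetical analog: pick one login node at random from a comma-separated list.
LOGIN_NODES="login01.example.com,login02.example.com,login03.example.com"
login_node=$(echo "$LOGIN_NODES" | tr ',' '\n' | shuf -n 1)
echo "Using login node: $login_node"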
@@ -145,9 +150,10 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
//TODO: consolidate slurm related code for both multi nodes and single nodes
def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -207,9 +213,10 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -290,13 +297,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
def slurmJobID = null
def dockerArgs = null
try {
// Run ssh command to start node in desired cluster via SLURM
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -314,6 +323,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
def slurmSubmitOutput = Utils.exec(
pipeline,
timeout: false,
@@ -353,9 +364,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
stage('Checking if the Node is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -373,8 +385,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
if (CloudManager.isNodeOnline(nodeName)) {
def dockerGPUOption = ""
node(nodeName) {
sh """
env | sort
@@ -393,7 +403,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
dockerGPUOption = sh(script: """
dockerArgs = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
@@ -404,7 +414,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
""", returnStdout: true).trim()
}
def dockerArgs = "${dockerGPUOption} " +
dockerArgs = "${dockerArgs} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--entrypoint=\"\" " +
@@ -415,18 +425,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add=SYSLOG"
echo "Final dockerArgs: ${dockerArgs}"
if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
echo "Final dockerArgs: ${dockerArgs}"
} else {
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
// Workaround to handle interruption while cleaning up SLURM resources
@@ -473,9 +482,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
passwordVariable: 'PASSWORD'
)
]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -545,7 +555,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
set -o pipefail
set -Eeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
@@ -571,6 +582,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
}
stage('Run Test') {
Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
Utils.exec(
pipeline,
timeout: false,
@@ -1940,14 +1953,18 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
stage('Pull Docker Image') {
docker.image(image).pull()
}
docker.image(image).inside(dockerArgs) {
runner()
// We submit the Slurm job with a timeout of SlurmConfig.DEFAULT_TIMEOUT minutes (300).
// The timeout here keeps the stage from hanging if the Slurm job gets stuck.
timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
docker.image(image).inside(dockerArgs) {
runner()
}
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw if it's a different IOException
throw e // Re-throw if it's a different Exception
}
}
}
@@ -2128,10 +2145,11 @@ def launchTestJobs(pipeline, testFilter)
multiNodesSBSAConfigs = [
// Each stage tests 1 testcase with 8 GPUs and 2 nodes.
"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
// Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
// "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],

View File

@@ -1,4 +1,9 @@
#!/bin/bash
# Set up error handling
set -Eeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
@@ -27,21 +32,25 @@ if [ $SLURM_LOCALID -eq 0 ]; then
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
touch install_lock.lock
else
while [ ! -f install_lock.lock ]; do
sleep 5
done
fi
testList="$testList_$splitId"
export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy
# TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
chmod +x $llmapiLaunchScript
cd $llmSrcNode/tests/integration/defs
testCmdLines=(
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
"$llmapiLaunchScript"
"pytest"
"-v"
"--timeout-method=thread"
@@ -88,6 +97,13 @@ echo "Library Path:"
echo "$LD_LIBRARY_PATH"
env | sort
fullCmd="${testCmdLines[*]}"
echo "Running: $testCase"
echo "Full Command: $fullCmd"
# Turn off "exit on error" so the following lines always run
set +e
trap - ERR
eval $fullCmd
exitCode=$?
echo "Pytest exit code: $exitCode"
exit $exitCode
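The test-run script above now starts in strict mode (set -Eeuo pipefail plus an ERR trap that reports the failing file, line, and command) and deliberately relaxes it with set +e and trap - ERR right before launching pytest, so the exit code can be captured, logged, and propagated instead of aborting at the first nonzero status. A minimal, self-contained sketch of that pattern, with a placeholder command standing in for the real test invocation:

#!/bin/bash
# Strict mode: -E keeps the ERR trap active in functions and subshells, -e exits
# on error, -u flags unset variables, -o pipefail fails a pipeline on any stage.
set -Eeuo pipefail
trap 'rc=$?; echo "Error in ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

echo "setup steps run under strict mode"

# Relax strict mode around the one command whose exit code we want to inspect.
set +e
trap - ERR
false   # placeholder for the real test command
exitCode=$?
echo "captured exit code: $exitCode"
exit $exitCode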

View File

@@ -0,0 +1,129 @@
#!/bin/bash
set -Eeo pipefail
task_with_command=("$@")
native_mpi_rank=$OMPI_COMM_WORLD_RANK
mpi_rank=${SLURM_PROCID:-${OMPI_COMM_WORLD_RANK:-${PMI_RANK:-${PMI_ID:-0}}}}
log_stderr() { echo -e "\033[33m$@\033[0m" >&2; }
log_stderr "mpi_rank: $mpi_rank"
pid=$(ps -o pid= -p $$ | tr -d ' ')
# Tell TRTLLM to spawn an additional process for the Proxy
export TLLM_SPAWN_PROXY_PROCESS=1
function mpi_world_size {
if [ -n "$SLURM_NTASKS" ]; then
echo "$SLURM_NTASKS"
elif [ -n "$OMPI_COMM_WORLD_SIZE" ]; then
echo "$OMPI_COMM_WORLD_SIZE"
else
echo "1"
fi
}
function export_free_tcp_addr_for_spawn_proxy_process {
# find free port starting from 10012
local free_port=$(python -c 'import socket; s=socket.socket();
port = 10012
while True:
    try:
        s.bind(("", port))
        break
    except OSError:
        port += 1
print(port); s.close()')
export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR="tcp://127.0.0.1:${free_port}"
log_stderr "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR: $TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR"
export TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY=$(openssl rand -hex 32)
}
export tllm_mpi_size=$(mpi_world_size)
log_stderr "tllm_mpi_size: $tllm_mpi_size"
export_free_tcp_addr_for_spawn_proxy_process
if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
log_stderr "rank${mpi_rank} run ${task_with_command[@]} in background"
# MPI doesn't allow spawning a process that shares the MPI environment from
# inside an MPI process; a duplicate MPI_Init in the child process would cause
# undefined behavior. Thus we need to clean the MPI environment in the parent
# process before spawning the child process, and restore the MPI environment
# later before running MPI operations in the parent process.
mpi_blacklist=(
OMPI_ PMIX_ PMI_ SLURM_ MPI_ UCX_
I_MPI_ HYDRA_ KMP_ MPICH_ MV2_ CRAY_
)
(
# Remove MPI-related variables only in the subshell context
for var in $(compgen -e); do
for prefix in "${mpi_blacklist[@]}"; do
if [[ "$var" == "$prefix"* ]]; then
unset "$var"
break
fi
done
done
# Turn off "exit on error" so the following lines always run
set +e
# Execute the task with cleaned environment
"${task_with_command[@]}"
task_exit_code=$?
echo "Task exit code: $task_exit_code"
# Stop the MPI Comm server
python3 -m tensorrt_llm.llmapi.mgmn_leader_node --action stop
mpi_exit_code=$?
echo "MPI Comm server exit code: $mpi_exit_code"
# Propagate task exit status
if [ $task_exit_code -ne 0 ]; then
exit $task_exit_code
else
exit $mpi_exit_code
fi
) 1>&2 &
# Turn off "exit on error" so the following lines always run
set +e
# Capture subshell PID
subshell_pid=$!
echo "Subshell PID: $subshell_pid"
log_stderr "rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..."
log_stderr "rank0 host: $HOSTNAME"
python3 -m tensorrt_llm.llmapi.mgmn_leader_node
mgmn_leader_node_exit_code=$?
echo "MGMN leader node exit code: $mgmn_leader_node_exit_code"
# Wait for subshell
wait $subshell_pid
# This is subshell's exit code
subshell_exit_code=$?
echo "Subshell exit code: $subshell_exit_code"
# Propagate subshell exit status
if [ $subshell_exit_code -ne 0 ]; then
exit $subshell_exit_code
else
exit $mgmn_leader_node_exit_code
fi
else
# Turn off "exit on error" so the following lines always run
set +e
log_stderr "rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..."
python3 -m tensorrt_llm.llmapi.mgmn_worker_node
mgmn_worker_node_exit_code=$?
echo "MGMN worker node exit code: $mgmn_worker_node_exit_code"
exit $mgmn_worker_node_exit_code
fi
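The central trick in this launcher is running the user command in a subshell whose environment has been scrubbed of MPI/Slurm-related variables, so the spawned process does not inherit the parent's MPI context and trigger a duplicate MPI_Init, while the parent keeps its own environment for the mgmn leader/worker modules; each branch then propagates the relevant exit code. A stripped-down, hypothetical sketch of just that scrub-and-propagate pattern (the prefix list mirrors the blacklist above; the command is whatever the caller passes in):

#!/bin/bash
# Minimal sketch: run "$@" in a subshell with MPI/Slurm-prefixed variables
# removed, then propagate its exit code to the caller.
mpi_blacklist=(OMPI_ PMIX_ PMI_ SLURM_ MPI_ UCX_ I_MPI_ HYDRA_ KMP_ MPICH_ MV2_ CRAY_)
(
    # Unset matching exported variables only inside this subshell; the parent's
    # environment is left untouched.
    for var in $(compgen -e); do
        for prefix in "${mpi_blacklist[@]}"; do
            if [[ "$var" == "$prefix"* ]]; then
                unset "$var"
                break
            fi
        done
    done
    exec "$@"   # replace the subshell with the cleaned-environment command
)
exit_code=$?
echo "command exited with $exit_code"
exit $exit_code

For example, running the sketch as ./scrub_mpi_env.sh env inside a Slurm allocation would print an environment with no SLURM_* or OMPI_* variables (scrub_mpi_env.sh is a hypothetical file name).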