Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[None][ci] Some improvements for Slurm CI (#7689)
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Commit 89fc136972 (parent 1f43854496)
@@ -283,7 +283,7 @@ def buildImage(config, imageKeyToTag)
sh "git config --global --add safe.directory '*'"

withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
- sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
+ trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
}

withCredentials([
@@ -294,7 +294,7 @@ def buildImage(config, imageKeyToTag)
),
string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
]) {
- sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
+ trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
}
}
def containerGenFailure = null

@@ -105,24 +105,28 @@ REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false

- COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=5"
+ COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"

def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]

Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

+ def downloadSucceed = false

pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
- def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+ downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
@@ -136,8 +140,9 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
println("No results xml to submit")
}
}

if (downloadSucceed) {
- junit(testResults: "${stageName}/results*.xml")
+ junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
}
}
}
@@ -145,9 +150,10 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
//TODO: consolidate slurm related code for both multi nodes and single nodes
def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -207,9 +213,10 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo

def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -290,13 +297,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)

def slurmJobID = null
+ def dockerArgs = null

try {
// Run ssh command to start node in desired cluster via SLURM
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -314,6 +323,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")

+ Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")

def slurmSubmitOutput = Utils.exec(
pipeline,
timeout: false,
@@ -353,9 +364,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

stage('Checking if the Node is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -373,8 +385,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
- def dockerGPUOption = ""
node(nodeName) {
sh """
env | sort
@@ -393,7 +403,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
- dockerGPUOption = sh(script: """
+ dockerArgs = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
@@ -404,7 +414,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
""", returnStdout: true).trim()
}

- def dockerArgs = "${dockerGPUOption} " +
+ dockerArgs = "${dockerArgs} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--entrypoint=\"\" " +
@@ -415,18 +425,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add=SYSLOG"

- echo "Final dockerArgs: ${dockerArgs}"
if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}

- slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
- executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
+ echo "Final dockerArgs: ${dockerArgs}"
} else {
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}

+ slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+ executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
@@ -473,9 +482,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
passwordVariable: 'PASSWORD'
)
]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -545,7 +555,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
- set -o pipefail
+ set -Eeuo pipefail
+ trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
@@ -571,6 +582,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
}

stage('Run Test') {
+ Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")

Utils.exec(
pipeline,
timeout: false,
@@ -1940,14 +1953,18 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
stage('Pull Docker Image') {
docker.image(image).pull()
}
- docker.image(image).inside(dockerArgs) {
- runner()
+ // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
+ // The timeout here is to avoid the Slurm job being stuck.
+ timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+ docker.image(image).inside(dockerArgs) {
+ runner()
+ }
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
- throw e // Re-throw if it's a different IOException
+ throw e // Re-throw if it's a different Exception
}
}
}
@@ -2128,10 +2145,11 @@ def launchTestJobs(pipeline, testFilter)

multiNodesSBSAConfigs = [
// Each stage test 1 testcase with 8 GPUs and 2 nodes.
- "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
- "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
- "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
- "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
+ // Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
+ // "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
+ // "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
+ // "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
+ // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
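The new COMMON_SSH_OPTIONS value above swaps TCP-level keepalives for SSH application-level keepalives: a probe every 30 seconds, with up to 20 unanswered probes (about 10 minutes) tolerated before the session is dropped. A minimal sketch of how these options end up on the scp command line used to fetch results; the login node, user, and stage names here are placeholders, not values from this commit:

    #!/bin/bash
    # Sketch only: LOGIN_NODE, SVC_USER, SVC_PASSWORD, NODE_NAME, and STAGE_NAME are hypothetical.
    COMMON_SSH_OPTIONS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"
    # Pull the JUnit results from the chosen login node; the keepalive options keep
    # the session open even when the remote side is slow to respond.
    sshpass -p "$SVC_PASSWORD" scp -r -p $COMMON_SSH_OPTIONS \
        "$SVC_USER@$LOGIN_NODE:/home/svc_tensorrt/bloom/scripts/$NODE_NAME/results/results.xml" \
        "$STAGE_NAME/"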
@@ -1,4 +1,9 @@
#!/bin/bash

+ # Set up error handling
+ set -Eeuo pipefail
+ trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

@@ -27,21 +32,25 @@ if [ $SLURM_LOCALID -eq 0 ]; then
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
- echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
+ hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
+ echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
touch install_lock.lock
else
while [ ! -f install_lock.lock ]; do
sleep 5
done
fi
testList="$testList_$splitId"
export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy

+ # TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
+ llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
+ chmod +x $llmapiLaunchScript
cd $llmSrcNode/tests/integration/defs
testCmdLines=(
- "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
+ "$llmapiLaunchScript"
"pytest"
"-v"
"--timeout-method=thread"
@@ -88,6 +97,13 @@ echo "Library Path:"
echo "$LD_LIBRARY_PATH"
env | sort
fullCmd="${testCmdLines[*]}"
echo "Running: $testCase"
echo "Full Command: $fullCmd"

+ # Turn off "exit on error" so the following lines always run
+ set +e
+ trap - ERR

eval $fullCmd
+ exitCode=$?
+ echo "Pytest exit code: $exitCode"
+ exit $exitCode
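The slurm_run.sh changes pair strict error handling during setup with an explicit exit-code hand-off around the test command: set -Eeuo pipefail plus an ERR trap abort on any setup failure, and set +e together with trap - ERR are issued right before the pytest invocation so its exit code can be captured and propagated rather than killing the script mid-report. A self-contained sketch of that pattern, assuming `false` stands in for the real test command:

    #!/bin/bash
    # Strict mode for setup: -E carries the ERR trap into functions and subshells,
    # -e exits on error, -u rejects unset variables, -o pipefail surfaces pipe failures.
    set -Eeuo pipefail
    trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

    echo "setup step that must succeed"

    # Relax strict mode so a failing test command does not abort the script
    # before its exit code is reported.
    set +e
    trap - ERR

    false   # placeholder for the real test command
    exitCode=$?
    echo "Test exit code: $exitCode"
    exit $exitCode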
jenkins/scripts/trtllm-llmapi-launch (new file, 129 lines)
@@ -0,0 +1,129 @@
#!/bin/bash
set -Eeo pipefail

task_with_command=("$@")
native_mpi_rank=$OMPI_COMM_WORLD_RANK
mpi_rank=${SLURM_PROCID:-${OMPI_COMM_WORLD_RANK:-${PMI_RANK:-${PMI_ID:-0}}}}

log_stderr() { echo -e "\033[33m$@\033[0m" >&2; }
log_stderr "mpi_rank: $mpi_rank"

pid=$(ps -o pid= -p $$ | tr -d ' ')

# Tell TRTLLM to spawn a additional process for the Proxy
export TLLM_SPAWN_PROXY_PROCESS=1

function mpi_world_size {
    if [ -n "$SLURM_NTASKS" ]; then
        echo "$SLURM_NTASKS"
    elif [ -n "$OMPI_COMM_WORLD_SIZE" ]; then
        echo "$OMPI_COMM_WORLD_SIZE"
    else
        echo "1"
    fi
}

function export_free_tcp_addr_for_spawn_proxy_process {
    # find free port starting from 10012
    local free_port=$(python -c 'import socket; s=socket.socket();
port = 10012
while True:
    try:
        s.bind(("", port))
        break
    except OSError:
        port += 1
print(port); s.close()')
    export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR="tcp://127.0.0.1:${free_port}"
    log_stderr "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR: $TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR"

    export TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY=$(openssl rand -hex 32)
}


export tllm_mpi_size=$(mpi_world_size)
log_stderr "tllm_mpi_size: $tllm_mpi_size"

export_free_tcp_addr_for_spawn_proxy_process

if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
    log_stderr "rank${mpi_rank} run ${task_with_command[@]} in background"

    # MPI doesn't allow spawn a process sharing the MPI environment in a MPI
    # process, or duplicate MPI_Init in the child process will cause undefined
    # behavior. Thus we need to clean the MPI environment in the parent process
    # before spawning the child process, and restore the MPI environment later
    # before running MPI operations in the parent process.
    mpi_blacklist=(
        OMPI_ PMIX_ PMI_ SLURM_ MPI_ UCX_
        I_MPI_ HYDRA_ KMP_ MPICH_ MV2_ CRAY_
    )

    (
        # Remove MPI-related variables only in the subshell context
        for var in $(compgen -e); do
            for prefix in "${mpi_blacklist[@]}"; do
                if [[ "$var" == "$prefix"* ]]; then
                    unset "$var"
                    break
                fi
            done
        done

        # Turn off "exit on error" so the following lines always run
        set +e

        # Execute the task with cleaned environment
        "${task_with_command[@]}"
        task_exit_code=$?
        echo "Task exit code: $task_exit_code"

        # Stop the MPI Comm server
        python3 -m tensorrt_llm.llmapi.mgmn_leader_node --action stop
        mpi_exit_code=$?
        echo "MPI Comm server exit code: $mpi_exit_code"

        # Propagate task exit status
        if [ $task_exit_code -ne 0 ]; then
            exit $task_exit_code
        else
            exit $mpi_exit_code
        fi
    ) 1>&2 &

    # Turn off "exit on error" so the following lines always run
    set +e

    # Capture subshell PID
    subshell_pid=$!
    echo "Subshell PID: $subshell_pid"

    log_stderr "rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..."
    log_stderr "rank0 host: $HOSTNAME"
    python3 -m tensorrt_llm.llmapi.mgmn_leader_node
    mgmn_leader_node_exit_code=$?
    echo "MGMN leader node exit code: $mgmn_leader_node_exit_code"

    # Wait for subshell
    wait $subshell_pid
    # This is subshell's exit code
    subshell_exit_code=$?
    echo "Subshell exit code: $subshell_exit_code"

    # Propagate subshell exit status
    if [ $subshell_exit_code -ne 0 ]; then
        exit $subshell_exit_code
    else
        exit $mgmn_leader_node_exit_code
    fi
else
    # Turn off "exit on error" so the following lines always run
    set +e

    log_stderr "rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..."
    python3 -m tensorrt_llm.llmapi.mgmn_worker_node
    mgmn_worker_node_exit_code=$?
    echo "MGMN worker node exit code: $mgmn_worker_node_exit_code"

    exit $mgmn_worker_node_exit_code
fi
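For context, slurm_run.sh above builds its test command by prefixing pytest with this launcher, so each Slurm task enters the wrapper; rank 0 runs the wrapped task in a background subshell with the MPI environment scrubbed and hosts the MGMN leader, while the other ranks run MGMN workers. A rough sketch of such an invocation under srun; the node and task counts are illustrative placeholders, not values from this commit:

    # Illustrative only: the srun geometry is hypothetical.
    srun --nodes=2 --ntasks-per-node=4 \
        "$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch" \
        pytest -v --timeout-method=thread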