TensorRT-LLMs/examples/disaggregated/slurm/benchmark/wait_server.sh
Kaiyu Xie 5a611cb8f5
[None] [feat] Enhancements to slurm scripts (#10112)
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
2025-12-21 10:24:56 -05:00

34 lines
887 B
Bash

#!/bin/bash
#
# Block until an HTTP server's /health endpoint reports success.
#
# Usage: wait_server.sh <hostname> <port>
#
# Polls http://<hostname>:<port>/health every HEALTH_CHECK_INTERVAL seconds.
# Exit status:
#   0  the endpoint answered with a non-error HTTP status (< 400)
#   1  arguments are missing, or the endpoint did not become healthy
#      within TIMEOUT seconds
set -euo pipefail

# Parse arguments. Checked explicitly so a missing argument yields a usage
# message rather than a bare "unbound variable" error from `set -u`.
if (( $# < 2 )); then
  echo "Usage: ${0##*/} <hostname> <port>" >&2
  exit 1
fi
hostname=$1
port=$2

# Constants for health check (all values are in seconds)
readonly TIMEOUT=1800 # 30 minutes
readonly HEALTH_CHECK_INTERVAL=10
readonly STATUS_UPDATE_INTERVAL=30
readonly CURL_CONNECT_TIMEOUT=5
readonly CURL_MAX_TIME=10

# Wait for server to be healthy
echo "Waiting for server ${hostname}:${port} to be healthy..."
start_time=$(date +%s)
last_status_elapsed=0

# --fail makes curl exit non-zero on HTTP error statuses (>= 400); without it
# any HTTP response, even a 500/503, would count as "healthy".
# --connect-timeout / --max-time bound each probe so a hung connection cannot
# stall the loop past the overall TIMEOUT.
while ! curl --silent --fail --output /dev/null \
  --connect-timeout "${CURL_CONNECT_TIMEOUT}" \
  --max-time "${CURL_MAX_TIME}" \
  "http://${hostname}:${port}/health"; do
  current_time=$(date +%s)
  elapsed=$((current_time - start_time))

  if (( elapsed >= TIMEOUT )); then
    echo "Error: Server not healthy after ${TIMEOUT} seconds" >&2
    exit 1
  fi

  # Compare against the time of the previous status line instead of testing
  # `elapsed % interval == 0`: probe latency means elapsed rarely lands exactly
  # on a multiple of the interval, which would silently skip updates.
  if (( elapsed - last_status_elapsed >= STATUS_UPDATE_INTERVAL )); then
    echo "Waiting for server to be healthy... (${elapsed}s elapsed)"
    last_status_elapsed=${elapsed}
  fi

  sleep "${HEALTH_CHECK_INTERVAL}"
done

echo "Server is healthy and ready to accept requests!"