mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Co-authored-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
109 lines
2.7 KiB
Bash
109 lines
2.7 KiB
Bash
#!/bin/bash
# cleanup_jobs.sh - Cancel all SLURM jobs tracked in jobs.txt
#
# This script is designed to run in GitLab CI after_script to ensure
# all SLURM jobs are cancelled when the pipeline is interrupted, cancelled,
# or times out.
#
# Usage:
#   bash cleanup_jobs.sh
#
# Environment variables:
#   OUTPUT_PATH: Directory containing jobs.txt and pytest.pid
#                (defaults to /tmp when unset or empty)
#
# Exit status:
#   0  nothing to cancel, or every job was cancelled / no longer in squeue
#   1  scancel failed for at least one job that squeue still lists

set -e

OUTPUT_PATH="${OUTPUT_PATH:-/tmp}"
JOBS_FILE="${OUTPUT_PATH}/jobs.txt"
PID_FILE="${OUTPUT_PATH}/pytest.pid"

#######################################
# Print the recorded pytest PID and whether that process is alive.
# Purely informational (for debugging); never fails the script.
# Arguments: $1 - path to the pytest.pid file
# Outputs:   status lines on stdout
# Returns:   0 always
#######################################
report_pytest_status() {
  local pid_file=$1
  local pid=""

  if [[ ! -f "$pid_file" ]]; then
    echo "No pytest.pid found (test may not have started)"
    echo ""
    return 0
  fi

  # Strip every whitespace character (newline, CR, blanks) around the PID.
  # An unreadable file must not abort the cleanup under `set -e`.
  if [[ -r "$pid_file" ]]; then
    pid=$(tr -d '[:space:]' < "$pid_file") || pid=""
  fi
  echo "Pytest PID: $pid"

  # Only probe strictly positive integers: `kill -0 0` and negative values
  # address process groups and would report a bogus "Still running".
  if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
    echo "Status: Unknown (pytest.pid does not contain a valid PID)"
  elif kill -0 "$pid" 2>/dev/null; then
    echo "Status: Still running"
  else
    echo "Status: Already terminated"
  fi
  echo ""
  return 0
}

#######################################
# Emit the job IDs stored in a jobs file, one per line: leading/trailing
# whitespace (including the CR of CRLF files) is trimmed, blank lines are
# dropped and duplicates are removed.
# Arguments: $1 - path to the jobs.txt file
# Outputs:   normalized job IDs on stdout
# Returns:   0 always
#######################################
read_job_ids() {
  local jobs_file=$1

  # `|| true`: grep exits 1 when no line survives the filter, which simply
  # means "no jobs" and must not trip `set -e`.
  sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' < "$jobs_file" \
    | sort -u \
    | grep -v '^$' || true
}

#######################################
# Cancel every job listed in the jobs file and print a summary.
# Arguments: $1 - output directory (only displayed)
#            $2 - path to jobs.txt
#            $3 - path to pytest.pid
# Outputs:   progress and summary on stdout
# Returns:   0 if nothing failed, 1 if scancel failed for a job that
#            squeue still lists
#######################################
run_cleanup() {
  local output_path=$1
  local jobs_file=$2
  local pid_file=$3
  local -a job_ids=()
  local job_id
  local cancelled=0
  local already_done=0
  local failed=0

  echo "=========================================="
  echo "SLURM Job Cleanup Script"
  echo "=========================================="
  echo "Output path: $output_path"
  echo ""

  # Show pytest PID if available (for debugging)
  report_pytest_status "$pid_file"

  if [[ ! -f "$jobs_file" ]]; then
    echo "[WARN] No jobs.txt found"
    echo " Nothing to cancel"
    echo "=========================================="
    return 0
  fi

  echo "[INFO] Reading jobs from: $jobs_file"

  while IFS= read -r job_id; do
    job_ids+=("$job_id")
  done < <(read_job_ids "$jobs_file")

  if (( ${#job_ids[@]} == 0 )); then
    echo "[WARN] jobs.txt is empty"
    echo " Nothing to cancel"
    echo "=========================================="
    return 0
  fi

  echo "Found ${#job_ids[@]} job(s) to cancel"
  echo ""

  echo "Cancelling jobs..."
  for job_id in "${job_ids[@]}"; do
    printf " %-12s ... " "$job_id"

    if scancel "$job_id" 2>/dev/null; then
      echo "[OK] Cancelled"
      cancelled=$((cancelled + 1))
    # scancel failed: it is a real failure only if squeue still lists the job.
    # -F/--: match the ID literally and never parse it as a grep option.
    elif squeue -j "$job_id" -h 2>/dev/null | grep -qF -- "$job_id"; then
      echo "[FAIL] Failed to cancel"
      failed=$((failed + 1))
    else
      echo "[SKIP] Already finished"
      already_done=$((already_done + 1))
    fi
  done

  echo ""
  echo "=========================================="
  echo "[DONE] Cleanup completed"
  echo " Total: ${#job_ids[@]}"
  echo " Cancelled: $cancelled"
  echo " Already done: $already_done"
  echo " Failed: $failed"
  echo "=========================================="

  # Report an error only if a cancellation actually failed
  if (( failed > 0 )); then
    return 1
  fi
  return 0
}

# Last command of the script: its return value (0 or 1) becomes the exit status.
run_cleanup "$OUTPUT_PATH" "$JOBS_FILE" "$PID_FILE"