From 87073d1ce4897ef5b6ec59530b35f3fa8a3f3014 Mon Sep 17 00:00:00 2001 From: Xianjie Qiao <5410381+qiaoxj07@users.noreply.github.com> Date: Wed, 21 Jan 2026 13:31:25 +0800 Subject: [PATCH] [None][fix] Fix copy start_logs in disagg slurm scripts (#10840) Signed-off-by: Xianjie <5410381+qiaoxj07@users.noreply.github.com> --- .../slurm/benchmark/run_benchmark.sh | 15 ++++++++--- .../slurm/benchmark/run_benchmark_nv_sa.sh | 25 +++++++++++++------ .../disaggregated/slurm/benchmark/submit.py | 8 ++++++ 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark.sh b/examples/disaggregated/slurm/benchmark/run_benchmark.sh index 47c83cf4b7..69d6e7f84f 100644 --- a/examples/disaggregated/slurm/benchmark/run_benchmark.sh +++ b/examples/disaggregated/slurm/benchmark/run_benchmark.sh @@ -99,11 +99,20 @@ do_process_all_logs(){ fi fi done + if [ "${mode}" = "clean" ]; then + if [ -d "${tmp_start_logs}" ]; then + mkdir -p ${log_path}/start_logs + cp ${tmp_start_logs}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true + cp ${tmp_start_logs}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true + rm -rf ${tmp_start_logs} + fi + fi } -mkdir -p ${log_path}/start_logs -cp ${log_path}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true -cp ${log_path}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true +tmp_start_logs=/tmp/${SLURM_JOB_ID}/start_logs +mkdir -p ${tmp_start_logs} +cp ${log_path}/3_output_CTX_*.log ${tmp_start_logs}/ 2>/dev/null || true +cp ${log_path}/3_output_GEN_*.log ${tmp_start_logs}/ 2>/dev/null || true # warmup requests for ucx connections if [ "${ucx_warmup_requests}" -gt 0 ]; then diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh index 8abcd7514d..b72a54f886 100644 --- a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh +++ 
b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh @@ -91,9 +91,9 @@ do_process_all_logs(){ local gen_num local line_count local start_line - for ctx_log in ${input_folder}/output_ctx_*.log; do + for ctx_log in ${input_folder}/3_output_CTX_*.log; do if [ -f "${ctx_log}" ]; then - ctx_num=$(basename "${ctx_log}" | sed 's/output_ctx_\([0-9]*\)\.log/\1/') + ctx_num=$(basename "${ctx_log}" | sed 's/3_output_CTX_\([0-9]*\)\.log/\1/') if [ "${mode}" = "line" ]; then line_count=$(wc -l < ${ctx_log}) echo ${line_count} > ${output_folder}/ctx_only_line_${ctx_num}.txt @@ -111,9 +111,9 @@ do_process_all_logs(){ fi done # process all the gen log files in the input folder - for gen_log in ${input_folder}/output_gen_*.log; do + for gen_log in ${input_folder}/3_output_GEN_*.log; do if [ -f "${gen_log}" ]; then - gen_num=$(basename "${gen_log}" | sed 's/output_gen_\([0-9]*\)\.log/\1/') + gen_num=$(basename "${gen_log}" | sed 's/3_output_GEN_\([0-9]*\)\.log/\1/') if [ "${mode}" = "line" ]; then line_count=$(wc -l < ${gen_log}) echo ${line_count} > ${output_folder}/gen_only_line_${gen_num}.txt @@ -130,11 +130,20 @@ do_process_all_logs(){ fi fi done + if [ "${mode}" = "clean" ]; then + if [ -d "${tmp_start_logs}" ]; then + mkdir -p ${log_path}/start_logs + cp ${tmp_start_logs}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true + cp ${tmp_start_logs}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true + rm -rf ${tmp_start_logs} + fi + fi } -mkdir -p ${log_path}/start_logs -cp ${log_path}/output_ctx_*.log ${log_path}/start_logs/ 2>/dev/null || true -cp ${log_path}/output_gen_*.log ${log_path}/start_logs/ 2>/dev/null || true +tmp_start_logs=/tmp/${SLURM_JOB_ID}/start_logs +mkdir -p ${tmp_start_logs} +cp ${log_path}/3_output_CTX_*.log ${tmp_start_logs}/ 2>/dev/null || true +cp ${log_path}/3_output_GEN_*.log ${tmp_start_logs}/ 2>/dev/null || true # warmup requests for ucx connections if [ "${ucx_warmup_requests}" -gt 0 ]; then @@ -160,8 +169,8 @@ for 
concurrency in ${concurrency_list}; do num_prompts=$((concurrency * multi_round)) output_dir="${log_path}/concurrency_${concurrency}" echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts" - do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "line" mkdir -p "${output_dir}" + do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "line" python "${BENCH_SCRIPT}" \ --model "${model_name}" \ diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 40d158e62d..a6fa8cc42c 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -484,6 +484,14 @@ def submit_job(config, log_dir, dry_run): f"&> {log_dir}/7_accuracy_eval_{task}.log" ] client_cmds.append(" ".join(accuracy_prefix + accuracy_cmd)) + + # record ${SLURM_JOB_NODELIST} to ${log_dir}/8_done_${SLURM_JOB_ID}.txt + done_cmd = [ + "echo", "${SLURM_JOB_NODELIST}", ">", + f"{log_dir}/8_done_${{SLURM_JOB_ID}}.txt" + ] + client_cmds.append(" ".join(done_cmd)) + with open(os.path.join(log_dir, "client_cmds.sh"), "w") as f: f.write("\n".join(client_cmds) + "\n")