[None][fix] Fix copy start_logs in disagg slurm scripts (#10840)

Signed-off-by: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
This commit is contained in:
Xianjie Qiao 2026-01-21 13:31:25 +08:00 committed by GitHub
parent 9116dfbacd
commit 87073d1ce4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 37 additions and 11 deletions

View File

@ -99,11 +99,20 @@ do_process_all_logs(){
fi
fi
done
if [ "${mode}" = "clean" ]; then
if [ -d "${tmp_start_logs}" ]; then
mkdir -p ${log_path}/start_logs
cp ${tmp_start_logs}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true
cp ${tmp_start_logs}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true
rm -rf ${tmp_start_logs}
fi
fi
}
mkdir -p ${log_path}/start_logs
cp ${log_path}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true
cp ${log_path}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true
tmp_start_logs=/tmp/${SLURM_JOB_ID}/start_logs
mkdir -p ${tmp_start_logs}
cp ${log_path}/3_output_CTX_*.log ${tmp_start_logs}/ 2>/dev/null || true
cp ${log_path}/3_output_GEN_*.log ${tmp_start_logs}/ 2>/dev/null || true
# warmup requests for ucx connections
if [ "${ucx_warmup_requests}" -gt 0 ]; then

View File

@ -91,9 +91,9 @@ do_process_all_logs(){
local gen_num
local line_count
local start_line
for ctx_log in ${input_folder}/output_ctx_*.log; do
for ctx_log in ${input_folder}/3_output_CTX_*.log; do
if [ -f "${ctx_log}" ]; then
ctx_num=$(basename "${ctx_log}" | sed 's/output_ctx_\([0-9]*\)\.log/\1/')
ctx_num=$(basename "${ctx_log}" | sed 's/3_output_CTX_\([0-9]*\)\.log/\1/')
if [ "${mode}" = "line" ]; then
line_count=$(wc -l < ${ctx_log})
echo ${line_count} > ${output_folder}/ctx_only_line_${ctx_num}.txt
@ -111,9 +111,9 @@ do_process_all_logs(){
fi
done
# process all the gen log files in the input folder
for gen_log in ${input_folder}/output_gen_*.log; do
for gen_log in ${input_folder}/3_output_GEN_*.log; do
if [ -f "${gen_log}" ]; then
gen_num=$(basename "${gen_log}" | sed 's/output_gen_\([0-9]*\)\.log/\1/')
gen_num=$(basename "${gen_log}" | sed 's/3_output_GEN_\([0-9]*\)\.log/\1/')
if [ "${mode}" = "line" ]; then
line_count=$(wc -l < ${gen_log})
echo ${line_count} > ${output_folder}/gen_only_line_${gen_num}.txt
@ -130,11 +130,20 @@ do_process_all_logs(){
fi
fi
done
if [ "${mode}" = "clean" ]; then
if [ -d "${tmp_start_logs}" ]; then
mkdir -p ${log_path}/start_logs
cp ${tmp_start_logs}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true
cp ${tmp_start_logs}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true
rm -rf ${tmp_start_logs}
fi
fi
}
mkdir -p ${log_path}/start_logs
cp ${log_path}/output_ctx_*.log ${log_path}/start_logs/ 2>/dev/null || true
cp ${log_path}/output_gen_*.log ${log_path}/start_logs/ 2>/dev/null || true
tmp_start_logs=/tmp/${SLURM_JOB_ID}/start_logs
mkdir -p ${tmp_start_logs}
cp ${log_path}/3_output_CTX_*.log ${tmp_start_logs}/ 2>/dev/null || true
cp ${log_path}/3_output_GEN_*.log ${tmp_start_logs}/ 2>/dev/null || true
# warmup requests for ucx connections
if [ "${ucx_warmup_requests}" -gt 0 ]; then
@ -160,8 +169,8 @@ for concurrency in ${concurrency_list}; do
num_prompts=$((concurrency * multi_round))
output_dir="${log_path}/concurrency_${concurrency}"
echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "line"
mkdir -p "${output_dir}"
do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "line"
python "${BENCH_SCRIPT}" \
--model "${model_name}" \

View File

@ -484,6 +484,14 @@ def submit_job(config, log_dir, dry_run):
f"&> {log_dir}/7_accuracy_eval_{task}.log"
]
client_cmds.append(" ".join(accuracy_prefix + accuracy_cmd))
# record ${SLURM_JOB_NODELIST} to ${log_dir}/8_done_${SLURM_JOB_ID}.txt
done_cmd = [
"echo", "${SLURM_JOB_NODELIST}", ">",
f"{log_dir}/8_done_${{SLURM_JOB_ID}}.txt"
]
client_cmds.append(" ".join(done_cmd))
with open(os.path.join(log_dir, "client_cmds.sh"), "w") as f:
f.write("\n".join(client_cmds) + "\n")