[None] [chore] cherry pick changes on slurm scripts from release/1.1.0rc2 (#7750)

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Author: Kaiyu Xie
Date: 2025-09-16 16:07:13 +08:00
Committed by: GitHub (GPG Key ID: B5690EEEBB952194)
Parent: b278d06481
Commit: 6eef19297f
4 changed files with 24 additions and 12 deletions

File 1 of 4 (path not shown)

@@ -83,8 +83,6 @@ echo "ntasks_per_node: ${ntasks_per_node}"
 echo "==========================================="
-nsys_on=""
-# nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling
 numa_bind=true # Only allocate memory from nodes, this only works on GB200
 ctx_max_seq_len=$((isl + 10))
 gen_max_seq_len=$((isl + osl + 10))
@@ -96,6 +94,9 @@ logdir=${workdir}/slurm-${SLURM_JOB_ID}/benchmark-${isl}-${osl}
 mkdir -p ${logdir}
 full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+nsys_on=""
+# nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling
 echo "concurrency: ${concurrency}"
 ctx_gpus=$((num_ctx_servers * ctx_tp_size * ctx_pp_size))

File 2 of 4 (path not shown)

@@ -48,6 +48,11 @@ def gen_config_file(work_dir: str,
         server_port: Server port
     """
     ctx_config = {
+        'build_config': {
+            'max_batch_size': ctx_batch_size,
+            'max_num_tokens': ctx_max_num_tokens,
+            'max_seq_len': ctx_max_seq_len,
+        },
         'max_batch_size': ctx_batch_size,
         'max_num_tokens': ctx_max_num_tokens,
         'max_seq_len': ctx_max_seq_len,
@@ -79,6 +84,11 @@ def gen_config_file(work_dir: str,
         gen_moe_backend = "TRTLLM"
     gen_config = {
+        'build_config': {
+            'max_batch_size': gen_batch_size,
+            'max_num_tokens': gen_max_num_tokens,
+            'max_seq_len': gen_max_seq_len,
+        },
         'tensor_parallel_size': gen_tp_size,
         'moe_expert_parallel_size': gen_tp_size,
         'enable_attention_dp': True if gen_enable_attention_dp else False,
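The two hunks above mirror each other: the engine size limits are duplicated into a nested `build_config` section while the existing top-level keys are kept. A minimal sketch of the resulting context-server dict, assuming the same variable names as the diff and that the config is eventually dumped to YAML (the helper function and toy values below are illustrative, not part of the change):

```python
import yaml  # PyYAML, assumed available in the environment

def sketch_ctx_config(ctx_batch_size: int, ctx_max_num_tokens: int,
                      ctx_max_seq_len: int) -> dict:
    # Shape of ctx_config after this change: the limits appear both at
    # the top level and under 'build_config'.
    return {
        'build_config': {
            'max_batch_size': ctx_batch_size,
            'max_num_tokens': ctx_max_num_tokens,
            'max_seq_len': ctx_max_seq_len,
        },
        'max_batch_size': ctx_batch_size,
        'max_num_tokens': ctx_max_num_tokens,
        'max_seq_len': ctx_max_seq_len,
    }

# Toy values; the real script derives these from the benchmark arguments.
print(yaml.safe_dump(sketch_ctx_config(64, 8192, 4106), sort_keys=False))
```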

File 3 of 4 (path not shown)

@@ -65,14 +65,14 @@ else
     nsys_file=${nsys_folder}/nsys_worker_proc_${instance_id}_${SLURM_PROCID}
     export TLLM_PROFILE_RECORD_GC=1
     export TLLM_NVTX_DEBUG=1
-    if [ "${role}" = "GEN" ]; then
+    if [ "${role}" = "GEN" ] && [ "$SLURM_PROCID" = "0" ]; then
         export TLLM_PROFILE_START_STOP=200-250
         nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
         echo "nsys_prefix: ${nsys_prefix}"
     elif [ "${role}" = "CTX" ]; then
         echo "nsys is not enabled on ctx_gpus"
     fi
-trtllm-llmapi-launch ${numa_bind_cmd} ${nsys_prefix} \
+${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
     trtllm-serve ${model_path} \
         --host $(hostname) --port ${port} \
         --extra_llm_api_options ${config_file}
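This hunk makes two changes: profiling is now gated on Slurm rank 0 in addition to the GEN role, and `${nsys_prefix}` moves in front of `trtllm-llmapi-launch` so the launcher itself runs under Nsight Systems. A hedged Python rendering of the rank gate (the environment variable and role names come from the diff; the function itself is hypothetical):

```python
import os

def nsys_enabled_for_this_rank(role: str) -> bool:
    # Mirror of the shell condition: profile only GEN workers, and only
    # on Slurm rank 0, so a single nsys report is produced per server.
    return role == "GEN" and os.environ.get("SLURM_PROCID") == "0"

os.environ.setdefault("SLURM_PROCID", "0")
print(nsys_enabled_for_this_rank("GEN"))  # True on rank 0
print(nsys_enabled_for_this_rank("CTX"))  # False: ctx servers are not profiled
```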

File 4 of 4 (path not shown)

@@ -494,11 +494,14 @@ class RandomDataset(BenchmarkDataset):
             # Filter out sequences that are too long or too short
             requests = []
-            for prompt, initial_prompt_len, cached_token_ids in zip(
-                    dataset, prompt_lengths, prompt_token_ids):
-                i = len(requests)
-                if i == num_requests:
-                    break
+            dataset_len = len(dataset)
+            for i in range(num_requests):
+                # Use modulo to cycle through the dataset when num_requests > dataset_len
+                dataset_idx = i % dataset_len
+                prompt = dataset[dataset_idx]
+                initial_prompt_len = prompt_lengths[dataset_idx]
+                cached_token_ids = prompt_token_ids[dataset_idx]
 
                 # Skip empty prompt
                 if initial_prompt_len == 0:
@@ -534,9 +537,6 @@
                         prompt_len=total_input_len,
                         expected_output_len=int(output_lens[i]),
                     ))
-            assert len(requests) == num_requests, (
-                f"Only {len(requests)} requests sampled from sharegpt dataset, {num_requests} requests are needed"
-            )
         else:
             for i in range(num_requests):
                 inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
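The rewritten loop runs exactly `num_requests` iterations and wraps around the dataset with a modulo index, which is why the assertion removed in the second hunk is no longer needed: the request count can never fall short. A self-contained sketch of the sampling pattern, using toy data in place of the real ShareGPT prompts:

```python
# Toy illustration of the modulo-cycling introduced above.
dataset = ["prompt-a", "prompt-b", "prompt-c"]  # stand-in for the real dataset
num_requests = 7                                # more requests than prompts

dataset_len = len(dataset)
requests = []
for i in range(num_requests):
    # Wrap around the dataset when num_requests > dataset_len.
    dataset_idx = i % dataset_len
    requests.append(dataset[dataset_idx])

assert len(requests) == num_requests  # always holds, hence the removed assert
print(requests)
```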
@@ -1131,6 +1131,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         if parser_fn is None:
             raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
+        sampled_requests = []
         for item in self.data:
             if len(prompts) >= num_requests:
                 break
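The single added line initializes the accumulator before the sampling loop so the list exists regardless of how the loop exits. A toy illustration of the pattern (names simplified; the real method also builds `prompts` and richer request objects):

```python
# Initialize the accumulator up front so it is defined even when the
# loop appends nothing.
data = ["item-1", "item-2"]  # stand-in for self.data
num_requests = 1

sampled_requests = []
for item in data:
    if len(sampled_requests) >= num_requests:
        break
    sampled_requests.append(item)

print(sampled_requests)  # ['item-1']
```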