[None][chore] Add sample yaml for wide-ep example and minor fixes (#8825)

Signed-off-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Co-authored-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
Kaiyu Xie 2025-11-03 23:48:34 +08:00 committed by GitHub
parent 89336fbf07
commit db2a42f641
5 changed files with 142 additions and 24 deletions

View File

@@ -5,7 +5,7 @@ slurm:
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  numa_bind: true
  numa_bind: true # Only enable for GB200 NVL72

# Benchmark Mode
benchmark:
@@ -42,7 +42,6 @@ profiling:
  nsys_on: false # Set to true to enable profiling

worker_config:
  eplb_num_slots: 0 # Number of slots for EPLB
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
@@ -77,6 +76,8 @@ worker_config:
    moe_config:
      backend: CUTLASS
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 0
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
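
For reference, this change moves the EPLB slot count from the top-level `worker_config.eplb_num_slots` key into the gen worker's `moe_config.load_balancer.num_slots`, which is where `submit.py` now reads it (see the submit.py diff below). A minimal sketch of reading the relocated key with PyYAML; the file name here is illustrative:

```python
import yaml

# Illustrative path; point this at your benchmark config.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

# Old location (removed by this change):
#   config["worker_config"]["eplb_num_slots"]
# New location, nested under the gen worker's MoE settings:
num_slots = config["worker_config"]["gen"]["moe_config"]["load_balancer"]["num_slots"]
print(f"EPLB slots: {num_slots}")
```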

View File

@@ -7,8 +7,8 @@ gpus_per_node=${1}
numa_bind=${2}
ctx_nodes=${3} # Number of nodes needed for ctx workers
gen_nodes=${4} # Number of nodes needed for gen workers
ctx_tp_size=${5} # Tensor parallel size for ctx workers
gen_tp_size=${6} # Tensor parallel size for gen workers
ctx_world_size=${5} # World size for ctx workers
gen_world_size=${6} # World size for gen workers
# Worker configuration
num_ctx_servers=${7}
@@ -47,8 +47,8 @@ echo " gpus_per_node: ${gpus_per_node}"
echo " numa_bind: ${numa_bind}"
echo " ctx_nodes: ${ctx_nodes}"
echo " gen_nodes: ${gen_nodes}"
echo " ctx_tp_size: ${ctx_tp_size}"
echo " gen_tp_size: ${gen_tp_size}"
echo " ctx_world_size: ${ctx_world_size}"
echo " gen_world_size: ${gen_world_size}"
echo
echo "Worker Configuration:"
echo " num_ctx_servers: ${num_ctx_servers}"
@@ -123,7 +123,7 @@ if [ -d "${trtllm_repo}" ]; then
echo "Installing TensorRT-LLM..."
if ! srun --container-name=${container_name} \
--container-mounts=${container_mount} \
--container-mounts=${container_mount} --no-container-mount-home \
--mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
bash -c "cd ${trtllm_repo} && pip install -e ." \
&> ${full_logdir}/install.log; then
@@ -167,7 +167,7 @@ echo "ctx_nodes_num_in_single_server: ${ctx_nodes_num_in_single_server}"
echo "Starting gen workers..."
for i in $(seq 0 $((num_gen_servers - 1))); do
srun -l -N ${gen_nodes_num_in_single_server} \
--ntasks=${gen_tp_size} \
--ntasks=$((gen_world_size)) \
--ntasks-per-node=${gpus_per_node} \
--container-image=${container_image} \
--container-name=${container_name} \
@@ -182,7 +182,7 @@ done
echo "Starting ctx workers..."
for i in $(seq 0 $((num_ctx_servers - 1))); do
srun -l -N ${ctx_nodes_num_in_single_server} \
--ntasks=${ctx_tp_size} \
--ntasks=$((ctx_world_size)) \
--ntasks-per-node=${gpus_per_node} \
--container-image=${container_image} \
--container-name=${container_name} \

View File

@@ -39,9 +39,9 @@ def save_worker_config(config, output_path, worker_type):
        yaml.dump(worker_config, f, default_flow_style=False)


def calculate_nodes(tp_size, num_servers, gpus_per_node):
    """Calculate required nodes based on tensor parallel size and server count."""
    return (tp_size + gpus_per_node - 1) // gpus_per_node * num_servers
def calculate_nodes(world_size, num_servers, gpus_per_node):
    """Calculate required nodes based on world size and server count."""
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


def submit_job(config):
@@ -50,10 +50,6 @@ def submit_job(config):
    hw_config = config['hardware']
    env_config = config['environment']

    # Calculate nodes based on tensor parallel sizes
    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']

    # Get number of servers from config
    ctx_num = hw_config['num_ctx_servers']
    gen_num = hw_config['num_gen_servers']
@@ -63,9 +59,16 @@ def submit_job(config):
    mtp_size = gen_config.get('speculative_config',
                              {}).get('num_nextn_predict_layers', 0)

    ctx_nodes = calculate_nodes(ctx_tp_size, ctx_num,
    # Calculate nodes based on world sizes
    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
    ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size']
    ctx_world_size = ctx_tp_size * ctx_pp_size
    ctx_nodes = calculate_nodes(ctx_world_size, ctx_num,
                                hw_config['gpus_per_node'])
    gen_nodes = calculate_nodes(gen_tp_size, gen_num,
    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
    gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size']
    gen_world_size = gen_tp_size * gen_pp_size
    gen_nodes = calculate_nodes(gen_world_size, gen_num,
                                hw_config['gpus_per_node'])
    total_nodes = ctx_nodes + gen_nodes
    total_tasks = total_nodes * hw_config['gpus_per_node']
@@ -82,9 +85,9 @@ def submit_job(config):
    # Determine directory suffix based on attention_dp
    if gen_enable_attention_dp:
        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"
    else:
        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"

    # Create full log directory path
    log_dir = os.path.join(log_base, dir_suffix)
@@ -114,8 +117,8 @@ def submit_job(config):
        str(slurm_config['numa_bind']).lower(),
        str(ctx_nodes), # Number of nodes needed for ctx workers
        str(gen_nodes), # Number of nodes needed for gen workers
        str(ctx_tp_size), # Tensor parallel size for ctx workers
        str(gen_tp_size), # Tensor parallel size for gen workers
        str(ctx_world_size), # World size for ctx workers
        str(gen_world_size), # World size for gen workers

        # Worker configuration
        str(ctx_num),
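
As a worked example using the sample wide-EP config added below in this commit (gpus_per_node: 4; ctx: tp 4 x pp 1 across 2 servers; gen: tp 32 x pp 1 on 1 server), the updated helper gives 2 ctx nodes and 8 gen nodes, so 10 nodes and 40 tasks in total. A minimal sketch of that arithmetic:

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    """Ceil-divide the world size over GPUs per node, then multiply by server count."""
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


gpus_per_node = 4                                      # from the sample hardware section
ctx_nodes = calculate_nodes(4 * 1, 2, gpus_per_node)   # ctx: tp=4, pp=1, 2 servers -> 2
gen_nodes = calculate_nodes(32 * 1, 1, gpus_per_node)  # gen: tp=32, pp=1, 1 server -> 8
total_nodes = ctx_nodes + gen_nodes                    # 10
total_tasks = total_nodes * gpus_per_node              # 40
print(ctx_nodes, gen_nodes, total_nodes, total_tasks)
```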

View File

@@ -34,6 +34,7 @@ Before running the scripts, ensure you have:
### Run Benchmarks

```bash
# Please find the `submit.py` script and an example `config.yaml` in the `examples/disaggregated/slurm/benchmark/` directory.
python3 submit.py -c your_config.yaml
# Please find the `submit.py` script in the `examples/disaggregated/slurm/benchmark/` directory.
# An example `config.yaml` for wide EP: `examples/wide_ep/slurm_scripts/config.yaml`.
python3 submit.py -c config.yaml
```

View File

@@ -0,0 +1,113 @@
# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  numa_bind: true # Only enable for GB200 NVL72

# Hardware Configuration
hardware:
  gpus_per_node: 4 # Modify this with your hardware configuration
  num_ctx_servers: 2 # Number of context servers
  num_gen_servers: 1 # Number of generation servers

# Benchmark Mode
benchmark:
  mode: "e2e" # Options: e2e, gen_only
  use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
  multi_round: 1 # Number of benchmark rounds
  benchmark_ratio: 0.8 # Benchmark ratio
  streaming: true # Enable streaming mode
  concurrency_list: "1024"

# Sequence Configuration
sequence:
  input_length: 8196 # Input sequence length
  output_length: 1024 # Output sequence length

# Environment Configuration
environment:
  container_mount: "<container_mount>" # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false # Don't build the wheel when launching multiple jobs
  dataset_file: "<dataset_file>"
  work_dir: "<full_path_to_work_dir>"

# Profiling Configuration
profiling:
  nsys_on: false # Set to true to enable profiling

# Worker Configuration
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 512
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 32
      - 64
      - 128
      - 256
      - 512
      - 768
      - 1024
      - 2048
      - 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
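
One way to sanity-check this sample before submitting is to load it and recompute the derived sizes; note that each worker's max_seq_len here leaves headroom above the benchmark sequence lengths (gen: 9236 >= 8196 + 1024; ctx: 8212 >= 8196). The sketch below is illustrative only and not part of submit.py; the config path is taken from the README above:

```python
import yaml

with open("examples/wide_ep/slurm_scripts/config.yaml") as f:
    cfg = yaml.safe_load(f)

seq = cfg["sequence"]
gen = cfg["worker_config"]["gen"]
ctx = cfg["worker_config"]["ctx"]

# World sizes as submit.py now derives them: tensor parallel x pipeline parallel.
gen_world_size = gen["tensor_parallel_size"] * gen["pipeline_parallel_size"]  # 32
ctx_world_size = ctx["tensor_parallel_size"] * ctx["pipeline_parallel_size"]  # 4

# Sequence-length headroom in this sample config.
assert gen["max_seq_len"] >= seq["input_length"] + seq["output_length"]
assert ctx["max_seq_len"] >= seq["input_length"]

print(f"gen world size: {gen_world_size}, ctx world size: {ctx_world_size}")
```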