[None][chore] Add sample yaml for wide-ep example and minor fixes (#8825)
Signed-off-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Co-authored-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
Parent: 89336fbf07
Commit: db2a42f641
config.yaml (disaggregated slurm benchmark example):
@@ -5,7 +5,7 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
-  numa_bind: true
+  numa_bind: true # Only enable for GB200 NVL72

 # Benchmark Mode
 benchmark:
@@ -42,7 +42,6 @@ profiling:
   nsys_on: false # Set to true to enable profiling

 worker_config:
-  eplb_num_slots: 0 # Number of slots for EPLB
   gen:
     tensor_parallel_size: 8
     moe_expert_parallel_size: 8
@@ -77,6 +76,8 @@ worker_config:
     moe_config:
       backend: CUTLASS
       use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 0
     cache_transceiver_config:
       max_tokens_in_buffer: 4608
       backend: DEFAULT
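For context, the EPLB slot count moves from the top-level `eplb_num_slots` key into the nested `moe_config.load_balancer.num_slots` key added above. A minimal sketch (not part of the diff; the config path is assumed for illustration) of how the value is now looked up:

```python
# Sketch: read the EPLB slot count from the new nested location.
import yaml

with open("config.yaml") as f:  # path assumed for illustration
    config = yaml.safe_load(f)

# Old (removed): config['worker_config']['eplb_num_slots']
eplb_slots = config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']
print(eplb_slots)  # 0 in the benchmark example, 288 in the wide-EP sample below
```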
disaggr_torch.slurm:
@@ -7,8 +7,8 @@ gpus_per_node=${1}
 numa_bind=${2}
 ctx_nodes=${3} # Number of nodes needed for ctx workers
 gen_nodes=${4} # Number of nodes needed for gen workers
-ctx_tp_size=${5} # Tensor parallel size for ctx workers
-gen_tp_size=${6} # Tensor parallel size for gen workers
+ctx_world_size=${5} # World size for ctx workers
+gen_world_size=${6} # World size for gen workers

 # Worker configuration
 num_ctx_servers=${7}
@@ -47,8 +47,8 @@ echo " gpus_per_node: ${gpus_per_node}"
 echo " numa_bind: ${numa_bind}"
 echo " ctx_nodes: ${ctx_nodes}"
 echo " gen_nodes: ${gen_nodes}"
-echo " ctx_tp_size: ${ctx_tp_size}"
-echo " gen_tp_size: ${gen_tp_size}"
+echo " ctx_world_size: ${ctx_world_size}"
+echo " gen_world_size: ${gen_world_size}"
 echo
 echo "Worker Configuration:"
 echo " num_ctx_servers: ${num_ctx_servers}"
@@ -123,7 +123,7 @@ if [ -d "${trtllm_repo}" ]; then

     echo "Installing TensorRT-LLM..."
     if ! srun --container-name=${container_name} \
-        --container-mounts=${container_mount} \
+        --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
         bash -c "cd ${trtllm_repo} && pip install -e ." \
         &> ${full_logdir}/install.log; then
@@ -167,7 +167,7 @@ echo "ctx_nodes_num_in_single_server: ${ctx_nodes_num_in_single_server}"
 echo "Starting gen workers..."
 for i in $(seq 0 $((num_gen_servers - 1))); do
     srun -l -N ${gen_nodes_num_in_single_server} \
-        --ntasks=${gen_tp_size} \
+        --ntasks=$((gen_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
         --container-image=${container_image} \
         --container-name=${container_name} \
@@ -182,7 +182,7 @@ done
 echo "Starting ctx workers..."
 for i in $(seq 0 $((num_ctx_servers - 1))); do
     srun -l -N ${ctx_nodes_num_in_single_server} \
-        --ntasks=${ctx_tp_size} \
+        --ntasks=$((ctx_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
         --container-image=${container_image} \
         --container-name=${container_name} \
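For reference, a small sketch of how the new world-size arguments map onto the per-server `srun` sizing above. The numbers are taken from the wide-EP sample config added in this commit (gen tp 32 x pp 1, 4 GPUs per node); the ceiling-division for nodes-per-server is an assumption mirroring `calculate_nodes` in `submit.py`, not code from the script itself.

```python
# Illustrative only: reproduce the srun sizing for one gen server.
gen_tp_size, gen_pp_size = 32, 1   # from the wide-EP sample config below
gpus_per_node = 4
gen_world_size = gen_tp_size * gen_pp_size

# Nodes a single gen server spans (ceiling division), i.e. the -N value
gen_nodes_num_in_single_server = (gen_world_size + gpus_per_node - 1) // gpus_per_node

print(f"srun -l -N {gen_nodes_num_in_single_server} "
      f"--ntasks={gen_world_size} --ntasks-per-node={gpus_per_node} ...")
# -> srun -l -N 8 --ntasks=32 --ntasks-per-node=4 ...
```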
submit.py:
@@ -39,9 +39,9 @@ def save_worker_config(config, output_path, worker_type):
         yaml.dump(worker_config, f, default_flow_style=False)


-def calculate_nodes(tp_size, num_servers, gpus_per_node):
-    """Calculate required nodes based on tensor parallel size and server count."""
-    return (tp_size + gpus_per_node - 1) // gpus_per_node * num_servers
+def calculate_nodes(world_size, num_servers, gpus_per_node):
+    """Calculate required nodes based on world size and server count."""
+    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


 def submit_job(config):
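A quick sanity check of the updated helper. The function body is copied from the hunk above; the example values come from the wide-EP sample config added in this commit and are illustrative only.

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    """Calculate required nodes based on world size and server count."""
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers

# gen: tp 32 * pp 1 = 32 ranks, 1 server, 4 GPUs per node -> 8 nodes
assert calculate_nodes(32, 1, 4) == 8
# ctx: tp 4 * pp 1 = 4 ranks, 2 servers, 4 GPUs per node -> 2 nodes
assert calculate_nodes(4, 2, 4) == 2
# Ceiling division: a 6-rank worker still occupies 2 full nodes of 4 GPUs
assert calculate_nodes(6, 1, 4) == 2
```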
@@ -50,10 +50,6 @@ def submit_job(config):
     hw_config = config['hardware']
     env_config = config['environment']

-    # Calculate nodes based on tensor parallel sizes
-    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
-    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
-
     # Get number of servers from config
     ctx_num = hw_config['num_ctx_servers']
     gen_num = hw_config['num_gen_servers']
@@ -63,9 +59,16 @@ def submit_job(config):
     mtp_size = gen_config.get('speculative_config',
                               {}).get('num_nextn_predict_layers', 0)

-    ctx_nodes = calculate_nodes(ctx_tp_size, ctx_num,
+    # Calculate nodes based on world sizes
+    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
+    ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size']
+    ctx_world_size = ctx_tp_size * ctx_pp_size
+    ctx_nodes = calculate_nodes(ctx_world_size, ctx_num,
                                 hw_config['gpus_per_node'])
-    gen_nodes = calculate_nodes(gen_tp_size, gen_num,
+    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
+    gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size']
+    gen_world_size = gen_tp_size * gen_pp_size
+    gen_nodes = calculate_nodes(gen_world_size, gen_num,
                                 hw_config['gpus_per_node'])
     total_nodes = ctx_nodes + gen_nodes
     total_tasks = total_nodes * hw_config['gpus_per_node']
@@ -82,9 +85,9 @@ def submit_job(config):

     # Determine directory suffix based on attention_dp
     if gen_enable_attention_dp:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"
     else:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"

     # Create full log directory path
     log_dir = os.path.join(log_base, dir_suffix)
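With the wide-EP sample values further down (2 ctx servers, 1 gen server, tp 32 with attention DP, batch 128, 288 EPLB slots, MTP 3), the suffix would come out roughly as follows; the snippet is illustrative and not part of the commit.

```python
# Illustrative values taken from the wide-EP sample config in this commit.
ctx_num, gen_num = 2, 1
gen_tp_size, gen_batch_size, mtp_size = 32, 128, 3
eplb_slots = 288  # worker_config.gen.moe_config.load_balancer.num_slots
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}"
print(dir_suffix)  # ctx2_gen1_dep32_batch128_eplb288_mtp3
```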
@@ -114,8 +117,8 @@ def submit_job(config):
         str(slurm_config['numa_bind']).lower(),
         str(ctx_nodes), # Number of nodes needed for ctx workers
         str(gen_nodes), # Number of nodes needed for gen workers
-        str(ctx_tp_size), # Tensor parallel size for ctx workers
-        str(gen_tp_size), # Tensor parallel size for gen workers
+        str(ctx_world_size), # World size for ctx workers
+        str(gen_world_size), # World size for gen workers

         # Worker configuration
         str(ctx_num),
README:
@@ -34,6 +34,7 @@ Before running the scripts, ensure you have:
 ### Run Benchmarks

 ```bash
-# Please find the `submit.py` script and an example `config.yaml` in the `examples/disaggregated/slurm/benchmark/` directory.
-python3 submit.py -c your_config.yaml
+# Please find the `submit.py` script in the `examples/disaggregated/slurm/benchmark/` directory.
+# An example `config.yaml` for wide EP: `examples/wide_ep/slurm_scripts/config.yaml`.
+python3 submit.py -c config.yaml
 ```
examples/wide_ep/slurm_scripts/config.yaml (new file, 113 lines):
@@ -0,0 +1,113 @@
# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  numa_bind: true # Only enable for GB200 NVL72

# Hardware Configuration
hardware:
  gpus_per_node: 4 # Modify this with your hardware configuration
  num_ctx_servers: 2 # Number of context servers
  num_gen_servers: 1 # Number of generation servers

# Benchmark Mode
benchmark:
  mode: "e2e" # Options: e2e, gen_only
  use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
  multi_round: 1 # Number of benchmark rounds
  benchmark_ratio: 0.8 # Benchmark ratio
  streaming: true # Enable streaming mode
  concurrency_list: "1024"

# Sequence Configuration
sequence:
  input_length: 8196 # Input sequence length
  output_length: 1024 # Output sequence length

# Environment Configuration
environment:
  container_mount: "<container_mount>" # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false # Don't build the wheel when launching multiple jobs
  dataset_file: "<dataset_file>"
  work_dir: "<full_path_to_work_dir>"

# Profiling Configuration
profiling:
  nsys_on: false # Set to true to enable profiling

# Worker Configuration
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 512
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
        - 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
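Putting the sample values above through the node math from `submit.py` gives the cluster footprint this config implies. This is a quick illustrative check, not part of the commit.

```python
# Illustrative: total allocation implied by the wide-EP sample config above.
gpus_per_node = 4
ctx_world_size = 4 * 1   # ctx: tensor_parallel_size * pipeline_parallel_size
gen_world_size = 32 * 1  # gen: tensor_parallel_size * pipeline_parallel_size

ctx_nodes = (ctx_world_size + gpus_per_node - 1) // gpus_per_node * 2  # 2 ctx servers
gen_nodes = (gen_world_size + gpus_per_node - 1) // gpus_per_node * 1  # 1 gen server
total_nodes = ctx_nodes + gen_nodes
total_tasks = total_nodes * gpus_per_node
print(ctx_nodes, gen_nodes, total_nodes, total_tasks)  # 2 8 10 40
```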