# TensorRT-LLM/examples/wide_ep/slurm_scripts/config.yaml

# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  numa_bind: true  # Only enable for GB200 NVL72
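
# Hedged launch sketch: this directory ships a Python submitter that reads this
# file; the script name and flag below are assumptions, so adjust them to the
# actual entry point if it differs:
#   python3 submit.py --config config.yaml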

# Hardware Configuration
hardware:
  gpus_per_node: 4  # Adjust to match your hardware
  num_ctx_servers: 2  # Number of context (prefill) servers
  num_gen_servers: 1  # Number of generation (decode) servers
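
# Worked sizing example (derived from worker_config below, assuming one rank
# per GPU): the gen server spans tensor_parallel_size 32 / gpus_per_node 4
# = 8 nodes; each ctx server spans 4 / 4 = 1 node, x 2 servers = 2 nodes,
# for roughly 10 nodes total.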

# Benchmark Mode
benchmark:
  mode: "e2e"  # Options: e2e, gen_only
  use_nv_sa_benchmark: false  # Whether to use the NVIDIA SA benchmark script
  multi_round: 1  # Number of benchmark rounds
  benchmark_ratio: 0.8  # Benchmark ratio
  streaming: true  # Enable streaming mode
  concurrency_list: "1024"  # Concurrency level(s) to benchmark
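
# Mode notes (an interpretation of the option names, not stated in this file):
# "e2e" exercises the full disaggregated context+generation pipeline, while
# "gen_only" benchmarks the generation servers in isolation.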

# Sequence Configuration
sequence:
  input_length: 8196  # Input sequence length
  output_length: 1024  # Output sequence length
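
# Consistency check (arithmetic on values elsewhere in this file): ctx
# max_seq_len 8212 = 8196 + 16, and gen max_seq_len 9236 = 8196 + 1024 + 16.
# The 16-token margin appears intentional; if you change the lengths here,
# adjust both max_seq_len values below to preserve it.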

# Environment Configuration
environment:
  container_mount: "<container_mount>"  # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false  # Don't rebuild the wheel when launching multiple jobs
  dataset_file: "<dataset_file>"
  work_dir: "<full_path_to_work_dir>"

# Profiling Configuration
profiling:
  nsys_on: false  # Set to true to enable Nsight Systems profiling

# Worker Configuration
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 512
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
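      # With enable_padding, a runtime batch that matches no captured size is
      # padded up to the next batch size in this list before CUDA graph replay.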
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
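        # Sizing note (assumes a DeepSeek-R1-style model with 256 routed
        # experts, which this wide-EP example targets): 288 slots = 256 experts
        # + 32 redundant slots, i.e. 288 / EP size 32 = 9 expert slots per rank.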
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
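      # Sizing note (inferred from this file): 8448 matches the ctx servers'
      # max_num_tokens, so the buffer holds one full prefill chunk of KV
      # tokens; keep the two values in sync.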
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
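    # With MTP decoding, num_nextn_predict_layers 3 configures three draft
    # layers, so each decode step can accept up to 3 speculative tokens on top
    # of the verified one.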
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
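    # Kept identical to the gen server's speculative_config; in disaggregated
    # serving the ctx and gen phases should stay in sync on MTP settings.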