# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  numa_bind: true  # Only enable for GB200 NVL72
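
# The fields above map onto standard sbatch options; a sketch of the equivalent
# submission (how the launcher script actually consumes them is not shown here):
#   sbatch --partition=<partition> --account=<account> \
#          --time=02:00:00 --job-name=<job_name> disaggr_torch.slurm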

# Hardware Configuration
hardware:
  gpus_per_node: 4  # Modify this to match your hardware
  num_ctx_servers: 2  # Number of context servers
  num_gen_servers: 1  # Number of generation servers
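
# GPU budget implied by this config (a sketch; assumes each server spans
# tensor_parallel_size GPUs, since pipeline_parallel_size is 1 for both):
#   ctx: 2 servers x TP4  =  8 GPUs
#   gen: 1 server  x TP32 = 32 GPUs
#   total = 40 GPUs -> 10 nodes at 4 GPUs per node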

# Benchmark Mode
benchmark:
  mode: "e2e"  # Options: e2e, gen_only
  use_nv_sa_benchmark: false  # Whether to use the NVIDIA SA benchmark script
  multi_round: 1  # Number of benchmark rounds
  benchmark_ratio: 0.8  # Benchmark ratio
  streaming: true  # Enable streaming mode
  concurrency_list: "1024"
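
# Mode semantics (inferred from the option names; verify against the launcher):
#   e2e      - exercise the full disaggregated path, context + generation
#   gen_only - benchmark the generation servers in isolation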

# Sequence Configuration
sequence:
  input_length: 8196  # Input sequence length
  output_length: 1024  # Output sequence length
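
# Consistency check against the worker settings below:
#   gen max_seq_len 9236 >= 8196 + 1024 (input + output, plus a small margin)
#   ctx max_seq_len 8212 >= 8196        (input only, plus a small margin)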

# Environment Configuration
environment:
  container_mount: "<container_mount>"  # Comma-separated mounts, each host_path:container_path
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false  # Skip rebuilding the wheel when launching multiple jobs
  dataset_file: "<dataset_file>"
  work_dir: "<full_path_to_work_dir>"
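
# Example mount string (hypothetical paths, only to illustrate the format):
#   container_mount: "/scratch/models:/scratch/models,/scratch/data:/scratch/data"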

# Profiling Configuration
profiling:
  nsys_on: false  # Set to true to enable Nsight Systems profiling
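
# Note: nsys profiling adds runtime overhead; keep it off for performance runs.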

# Worker Configuration
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 512
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
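      # With enable_padding: true, an incoming batch is padded up to the
      # nearest captured size above, so a graph is replayed for any batch
      # size up to 2048.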
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
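        # 288 slots over EP32 gives 9 expert slots per rank; assuming a
        # DeepSeek-style model with 256 routed experts (not stated here),
        # that leaves 32 redundant slots for load balancing.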
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
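      # MTP with num_nextn_predict_layers: 3 drafts up to 3 extra tokens per
      # step, so each decode iteration can accept up to 4 tokens.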
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
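
# Context workers run a single long prefill per request, so CUDA graphs bring
# little benefit there (cuda_graph_config: null), and disaggregated context
# servers require disable_overlap_scheduler: true.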