# TensorRT-LLM/examples/disaggregated/slurm/benchmark/config.yaml

# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  extra_args: ""  # Cluster-specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
  set_segment: true  # Optional: whether to request a Slurm segment for the job
  numa_bind: true  # Enable only on GB200/GB300 NVL72 systems
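  # These fields are consumed by submit.py when it assembles the sbatch command,
  # roughly as follows (illustrative; the script builds the exact flags):
  #   sbatch --partition=<partition> --account=<account> --time=02:00:00 \
  #       --job-name=<job_name> ${extra_args} disaggr_torch.slurm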
# Benchmark Mode
benchmark:
  mode: "e2e"  # Options: e2e, gen_only, gen_only_no_context
  use_nv_sa_benchmark: false  # Whether to use the NVIDIA SA benchmark script
  multi_round: 8  # Number of benchmark rounds
  benchmark_ratio: 0.8  # Ratio passed through to the benchmark script
  streaming: true  # Enable streaming mode
  concurrency_list: "16"  # Concurrency levels to benchmark
  input_length: 1024  # Input sequence length
  output_length: 1024  # Output sequence length
  dataset_file: "<dataset_file>"
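  # Note: each request spans input_length + output_length = 2048 tokens; the
  # worker max_seq_len values below are sized against these lengths with
  # headroom (gen: 2251 for the full sequence, ctx: 1227 for the input phase).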
# Hardware Configuration
hardware:
  gpus_per_node: 4  # Modify this to match your hardware configuration
  num_ctx_servers: 1  # Number of context servers
  num_gen_servers: 1  # Number of generation servers
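  # Illustrative sizing with the worker_config below (pipeline_parallel_size and
  # context_parallel_size are both 1, so GPUs per server = tensor_parallel_size):
  #   total GPUs = 1 ctx server * 4 + 1 gen server * 8 = 12 -> 3 nodes at 4 GPUs/node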
# Environment Configuration
environment:
  container_mount: "<container_mount>"  # Comma-separated src:dst mount pairs, e.g. path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false  # Keep false when launching multiple jobs to avoid rebuilding the wheel per job
  cuda_architectures: ""  # Optional CUDA architectures to build for (e.g. "90-real;100-real"); if empty, builds for all architectures
  trtllm_wheel_path: ""  # Path to a pre-built TensorRT-LLM wheel; if provided, install from this wheel instead of building
  work_dir: "<full_path_to_work_dir>"
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_GRAPH_MIXING_SUPPORT=0"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
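  # worker_env_var / server_env_var are space-separated KEY=VALUE pairs that are
  # exported into the worker and server processes respectively.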
# Profiling Configuration
profiling:
  nsys_on: false  # Set to true to enable Nsight Systems profiling
  ctx_profile_range: "10-30"  # Sets TLLM_PROFILE_START_STOP for ctx workers
  gen_profile_range: "200-250"  # Sets TLLM_PROFILE_START_STOP for gen workers
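  # Each range is a start-stop iteration window; only iterations inside the
  # window are captured in the nsys profile.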
# Accuracy Configuration
accuracy:
  enable_accuracy_test: false  # Set to true to enable accuracy evaluation
  tasks:
    gsm8k:
      model: "local-completions"  # Model type for lm_eval
      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
      extra_kwargs:
        trust_remote_code: true
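  # Roughly the lm_eval invocation this maps to (illustrative; the endpoint URL
  # is filled in by the scripts):
  #   lm_eval --model local-completions --tasks gsm8k \
  #       --model_args base_url=<server_url>,num_concurrent=512,max_retries=3,...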
worker_config:
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    context_parallel_size: 1
    # Uncomment this section to enable context parallelism.
    # cp_config:
    #   cp_type: "HELIX"
    #   tokens_per_block: 32  # Must match kv_cache_config.tokens_per_block.
    max_batch_size: 256
    max_num_tokens: 512
    max_seq_len: 2251
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
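    # With enable_padding: true, incoming batches are padded up to the nearest
    # captured size so decode iterations can run under a CUDA graph.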
    print_iter_log: true
    trust_remote_code: true
    kv_cache_config:
      enable_block_reuse: false
      tokens_per_block: 32
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTLASS
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 0
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:  # Remove this section when mtp_size is 0.
      decoding_type: MTP
      num_nextn_predict_layers: 1
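    # MTP (Multi-Token Prediction) drafts num_nextn_predict_layers extra tokens
    # per step; this draft-token overhead is part of why max_seq_len carries
    # headroom beyond input_length + output_length.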
  ctx:
    max_batch_size: 4
    max_num_tokens: 4608
    max_seq_len: 1227
    tensor_parallel_size: 4
    context_parallel_size: 1
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    trust_remote_code: true
    cuda_graph_config: null  # Prefill is compute-bound, so CUDA graphs are not used for ctx workers
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    speculative_config:  # Remove this section when mtp_size is 0.
      decoding_type: MTP
      num_nextn_predict_layers: 1
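
# Submission sketch (flag name assumed; check submit.py for the exact interface):
#   python submit.py --config config.yaml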