# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  extra_args: ""  # Cluster-specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
  set_segment: true  # Optional: whether to set the segment for the job
  numa_bind: true  # Only enable for GB200/GB300 NVL72
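  # Illustrative fill-in for the placeholder fields above; these values are
  # assumptions, not project defaults. Substitute your cluster's settings:
  #   partition: "batch"
  #   account: "my_account"
  #   job_name: "disagg_bench"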

# Benchmark Mode
benchmark:
  mode: "e2e"  # Options: e2e, gen_only, gen_only_no_context
  use_nv_sa_benchmark: false  # Whether to use the NVIDIA SA benchmark script
  multi_round: 8  # Number of benchmark rounds
  benchmark_ratio: 0.8  # Benchmark ratio
  streaming: true  # Enable streaming mode
  concurrency_list: "16"
  input_length: 1024  # Input sequence length
  output_length: 1024  # Output sequence length
  dataset_file: "<dataset_file>"
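  # concurrency_list can hold more than one value for a sweep; the launcher is
  # assumed here to split the string on whitespace (hypothetical example):
  #   concurrency_list: "1 4 16 64 256"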

# Hardware Configuration
hardware:
  gpus_per_node: 4  # Modify this to match your hardware configuration
  num_ctx_servers: 1  # Number of context servers
  num_gen_servers: 1  # Number of generation servers
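  # Sizing sketch from the values above: the gen worker below uses
  # tensor_parallel_size: 8, so with gpus_per_node: 4 each gen server spans
  # 8 / 4 = 2 nodes, while the ctx worker (tensor_parallel_size: 4) fits on
  # one node; one ctx server plus one gen server therefore needs 3 nodes
  # (12 GPUs) in total.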

# Environment Configuration
environment:
  container_mount: "<container_mount>"  # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false  # Don't build the wheel when launching multiple jobs
  cuda_architectures: ""  # Optional CUDA architectures to build for (e.g. "90-real;100-real"); if empty, builds for all architectures
  trtllm_wheel_path: ""  # Path to a pre-built TensorRT-LLM wheel; if provided, install from this wheel instead of building
  work_dir: "<full_path_to_work_dir>"
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_GRAPH_MIXING_SUPPORT=0"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
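  # Hypothetical mount string following the format noted above (both paths are
  # placeholders, not defaults):
  #   container_mount: "/lustre/models:/lustre/models,/home/user/TensorRT-LLM:/workspace/TensorRT-LLM"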

# Profiling Configuration
profiling:
  nsys_on: false  # Set to true to enable profiling
  ctx_profile_range: "10-30"  # Set TLLM_PROFILE_START_STOP for ctx workers
  gen_profile_range: "200-250"  # Set TLLM_PROFILE_START_STOP for gen workers
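  # With nsys_on: true, the ranges above reach the workers via
  # TLLM_PROFILE_START_STOP (per the comments above); e.g. a ctx worker would
  # profile iterations 10 through 30, a gen worker iterations 200 through 250.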

# Accuracy Configuration
accuracy:
  enable_accuracy_test: false  # Set to true to enable accuracy evaluation
  tasks:
    gsm8k:
      model: "local-completions"  # Model type for lm_eval
      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
      extra_kwargs:
        trust_remote_code: true
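    # A second lm_eval task could be listed next to gsm8k in the same shape;
    # this entry is purely hypothetical (task name and args are illustrative):
    #   mmlu:
    #     model: "local-completions"
    #     model_args_extra: "num_concurrent=128,max_retries=3"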

worker_config:
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    context_parallel_size: 1
    # Uncomment this section to enable context parallelism.
    # cp_config:
    #   cp_type: "HELIX"
    #   tokens_per_block: 32  # Must match kv_cache_config.tokens_per_block.
    max_batch_size: 256
    max_num_tokens: 512
    max_seq_len: 2251
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
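      # With enable_padding: true, a batch is padded up to the nearest size
      # captured above (e.g. a decode batch of 20 runs the size-32 graph),
      # trading a little redundant compute for a CUDA-graph hit every step.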
    print_iter_log: true
    trust_remote_code: true
    kv_cache_config:
      enable_block_reuse: false
      tokens_per_block: 32
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTLASS
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 0
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:  # When mtp_size is 0, remove this section.
      decoding_type: MTP
      num_nextn_predict_layers: 1
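    # decoding_type: MTP uses the model's multi-token-prediction head for
    # speculative decoding; num_nextn_predict_layers is the number of draft
    # tokens proposed per step (1 here).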
  ctx:
    max_batch_size: 4
    max_num_tokens: 4608
    max_seq_len: 1227
    tensor_parallel_size: 4
    context_parallel_size: 1
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    trust_remote_code: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    speculative_config:  # When mtp_size is 0, remove this section.
      decoding_type: MTP
      num_nextn_predict_layers: 1
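    # Assumed rationale for cuda_graph_config: null and
    # disable_overlap_scheduler: true above: the ctx worker serves the prefill
    # phase only, where CUDA graphs and the overlap scheduler (both aimed at
    # the token-by-token decode loop) have little to offer.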