# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  extra_args: ""  # Cluster-specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
  set_segment: true  # Optional: whether to set the segment for the job
  numa_bind: true  # Only enable for GB200/GB300 NVL72
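  # Illustrative fill-in for the placeholder fields above; these values are
  # assumptions, not project defaults. Substitute your cluster's settings:
  #   partition: "batch"
  #   account: "my_account"
  #   job_name: "disagg_bench"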

# Benchmark Mode
benchmark:
  mode: "e2e"  # Options: e2e, gen_only, gen_only_no_context
  use_nv_sa_benchmark: false  # Whether to use the NVIDIA SA benchmark script
  multi_round: 8  # Number of benchmark rounds
  benchmark_ratio: 0.8  # Benchmark ratio
  streaming: true  # Enable streaming mode
  concurrency_list: "16"
  input_length: 1024  # Input sequence length
  output_length: 1024  # Output sequence length
  dataset_file: "<dataset_file>"
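  # concurrency_list can hold more than one value for a sweep; the launcher is
  # assumed here to split the string on whitespace (hypothetical example):
  #   concurrency_list: "1 4 16 64 256"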

# Hardware Configuration
hardware:
  gpus_per_node: 4  # Modify this to match your hardware configuration
  num_ctx_servers: 1  # Number of context servers
  num_gen_servers: 1  # Number of generation servers
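  # Sizing sketch from the values above: the gen worker below uses
  # tensor_parallel_size: 8, so with gpus_per_node: 4 each gen server spans
  # 8 / 4 = 2 nodes, while the ctx worker (tensor_parallel_size: 4) fits on
  # one node; one ctx server plus one gen server therefore needs 3 nodes
  # (12 GPUs) in total.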

# Environment Configuration
environment:
  container_mount: "<container_mount>"  # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false  # Don't build the wheel when launching multiple jobs
  cuda_architectures: ""  # Optional CUDA architectures to build for (e.g. "90-real;100-real"); if empty, builds for all architectures
  trtllm_wheel_path: ""  # Path to a pre-built TensorRT-LLM wheel; if provided, install from this wheel instead of building
  work_dir: "<full_path_to_work_dir>"
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_GRAPH_MIXING_SUPPORT=0"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
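  # Hypothetical mount string following the format noted above (both paths are
  # placeholders, not defaults):
  #   container_mount: "/lustre/models:/lustre/models,/home/user/TensorRT-LLM:/workspace/TensorRT-LLM"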

# Profiling Configuration
profiling:
  nsys_on: false  # Set to true to enable profiling
  ctx_profile_range: "10-30"  # Set TLLM_PROFILE_START_STOP for ctx workers
  gen_profile_range: "200-250"  # Set TLLM_PROFILE_START_STOP for gen workers
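  # With nsys_on: true, the ranges above reach the workers via
  # TLLM_PROFILE_START_STOP (per the comments above); e.g. a ctx worker would
  # profile iterations 10 through 30, a gen worker iterations 200 through 250.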

# Accuracy Configuration
accuracy:
  enable_accuracy_test: false  # Set to true to enable accuracy evaluation
  tasks:
    gsm8k:
      model: "local-completions"  # Model type for lm_eval
      model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384"
      extra_kwargs:
        trust_remote_code: true
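    # A second lm_eval task could be listed next to gsm8k in the same shape;
    # this entry is purely hypothetical (task name and args are illustrative):
    #   mmlu:
    #     model: "local-completions"
    #     model_args_extra: "num_concurrent=128,max_retries=3"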

worker_config:
  gen:
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    context_parallel_size: 1
    # Uncomment this section to enable context parallelism.
    # cp_config:
    #   cp_type: "HELIX"
    #   tokens_per_block: 32  # Must match kv_cache_config.tokens_per_block.
    max_batch_size: 256
    max_num_tokens: 512
    max_seq_len: 2251
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
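      # With enable_padding: true, a batch is padded up to the nearest size
      # captured above (e.g. a decode batch of 20 runs the size-32 graph),
      # trading a little redundant compute for a CUDA-graph hit every step.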
    print_iter_log: true
    trust_remote_code: true
    kv_cache_config:
      enable_block_reuse: false
      tokens_per_block: 32
      free_gpu_memory_fraction: 0.8
      dtype: fp8
    moe_config:
      backend: CUTLASS
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 0
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:  # When mtp_size is 0, remove this section.
      decoding_type: MTP
      num_nextn_predict_layers: 1
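    # decoding_type: MTP uses the model's multi-token-prediction head for
    # speculative decoding; num_nextn_predict_layers is the number of draft
    # tokens proposed per step (1 here).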
  ctx:
    max_batch_size: 4
    max_num_tokens: 4608
    max_seq_len: 1227
    tensor_parallel_size: 4
    context_parallel_size: 1
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    trust_remote_code: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 4608
      backend: DEFAULT
    speculative_config:  # When mtp_size is 0, remove this section.
      decoding_type: MTP
      num_nextn_predict_layers: 1
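    # Assumed rationale for cuda_graph_config: null and
    # disable_overlap_scheduler: true above: the ctx worker serves the prefill
    # phase only, where CUDA graphs and the overlap scheduler (both aimed at
    # the token-by-token decode loop) have little to offer.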