runtime: trtllm
compile_backend: torch-cudagraph
max_batch_size: 384
max_seq_len: 65536 # tunable
enable_chunked_prefill: true
attn_backend: flashinfer
model_factory: AutoModelForCausalLM
skip_loading_weights: false
# TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/9884
free_mem_ratio: 0.88
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
kv_cache_config:
  # disable kv_cache reuse since not supported for hybrid/ssm models
  enable_block_reuse: false
transforms:
  detect_sharding:
    allreduce_strategy: SYMM_MEM
    sharding_dims: ['ep', 'bmm']
    manual_config:
      head_dim: 128
      tp_plan:
        # mamba SSM layer
        "in_proj": "mamba"
        "out_proj": "rowwise"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # NOTE: consider not sharding shared experts and/or
        # latent projections at all, keeping them replicated.
        # To do so, comment out the corresponding entries.
        # moe layer: SHARED experts
        "up_proj": "colwise"
        "down_proj": "rowwise"
        # MoLE: latent projections: simple shard
        "fc1_latent_proj": "gather"
        "fc2_latent_proj": "gather"
  multi_stream_moe:
    stage: compile
    enabled: true
  # tunable mamba cache dtype
  # --> use float32 for accuracy and default (null) for speed
  insert_cached_ssm_attention:
    cache_config:
      # mamba_dtype: float32
      mamba_dtype: null
  gather_logits_before_lm_head:
    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
    enabled: true
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true
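# Usage sketch (assumption, not verified against a specific TensorRT-LLM
# release): a config like this is typically passed as extra LLM API options
# when serving, e.g.
#
#   trtllm-serve <hf_model_id_or_checkpoint_dir> \
#     --backend _autodeploy \
#     --extra_llm_api_options /path/to/this_config.yaml
#
# The backend value and option names may differ between versions; check
# trtllm-serve --help before relying on them.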