runtime: trtllm
compile_backend: torch-cudagraph
max_batch_size: 384
max_seq_len: 65536 # tunable
enable_chunked_prefill: true
attn_backend: flashinfer
model_factory: AutoModelForCausalLM
skip_loading_weights: false
free_mem_ratio: 0.9
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
kv_cache_config:
  # disable KV cache block reuse since it is not supported for hybrid/SSM models
  enable_block_reuse: false
transforms:
  detect_sharding:
    sharding_source: ['factory', 'heuristic']
    sharding_dims: ['ep', 'bmm']
  # tunable Mamba cache dtype:
  # use float32 for accuracy, or the default (null) for speed
  insert_cached_ssm_attention:
    cache_config:
      # mamba_dtype: float32
      mamba_dtype: null
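
# Usage sketch (illustrative assumption, not part of the config itself):
# a file like this is typically loaded and its keys forwarded to the
# runtime entry point. A minimal Python sketch, assuming PyYAML is
# installed and a hypothetical filename "autodeploy_config.yaml":
#
#   import yaml
#
#   with open("autodeploy_config.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   # sanity checks on the values defined above
#   assert cfg["runtime"] == "trtllm"
#   assert cfg["max_batch_size"] == max(cfg["cuda_graph_batch_sizes"])
#
# Check the AutoDeploy documentation for your TensorRT-LLM version for
# the exact entry point and flag that accept this file.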