From 31314b9fedc186e9365ccdc5777abb3b0c62c8ed Mon Sep 17 00:00:00 2001
From: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Date: Thu, 12 Feb 2026 08:37:42 +0200
Subject: [PATCH] [None][chore] added AutoDeploy nano_v3_scale.yaml (#10845)

Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
---
 .../auto_deploy/nano_v3_multi_device.yaml | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 examples/auto_deploy/nano_v3_multi_device.yaml

diff --git a/examples/auto_deploy/nano_v3_multi_device.yaml b/examples/auto_deploy/nano_v3_multi_device.yaml
new file mode 100644
index 0000000000..b311c19017
--- /dev/null
+++ b/examples/auto_deploy/nano_v3_multi_device.yaml
@@ -0,0 +1,51 @@
+runtime: trtllm
+compile_backend: torch-cudagraph
+max_batch_size: 384
+max_seq_len: 65536 # tunable
+enable_chunked_prefill: true
+attn_backend: flashinfer
+model_factory: AutoModelForCausalLM
+skip_loading_weights: false
+sampler_type: "TRTLLMSampler"
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
+kv_cache_config:
+  free_gpu_memory_fraction: 0.88
+  # tunable mamba cache dtype
+  # --> use float32 for accuracy and default (auto) for speed
+  mamba_ssm_cache_dtype: auto
+transforms:
+  detect_sharding:
+    allreduce_strategy: SYMM_MEM
+    sharding_dims: ['tp','ep', 'bmm']
+    process_grid: {'tp': 8, 'ep': 1}
+    manual_config:
+      head_dim: 128
+      tp_plan:
+        # mamba SSM layer
+        "in_proj": "mamba"
+        "out_proj": "rowwise"
+        # attention layer
+        "q_proj": "colwise"
+        "k_proj": "colwise"
+        "v_proj": "colwise"
+        "o_proj": "rowwise"
+        # NOTE: consider not sharding shared experts and/or
+        # latent projections at all, keeping them replicated.
+        # To do so, comment out the corresponding entries.
+        # moe layer: SHARED experts
+        # "up_proj": "colwise"
+        # "down_proj": "rowwise"
+        # MoLE: latent projections: simple shard
+        # "fc1_latent_proj": "gather"
+        # "fc2_latent_proj": "gather"
+  multi_stream_moe:
+    stage: compile
+    enabled: false
+  gather_logits_before_lm_head:
+    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
+    enabled: true
+  fuse_mamba_a_log:
+    stage: post_load_fusion
+    enabled: true
+  insert_cached_ssm_attention:
+    backend: flashinfer_ssm
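
The sketch below is not part of the patch; it is a minimal way to sanity-check the new nano_v3_multi_device.yaml before launching a multi-GPU run. It assumes PyYAML is installed and that it is run from the repository root; the script name and the specific checks are illustrative, not an AutoDeploy API.

# sanity_check_nano_v3_config.py -- illustrative only, not part of this patch.
# Loads the new AutoDeploy config and checks a few internal consistency
# properties. Assumes PyYAML is installed and the script is run from the
# repository root.
import yaml

CONFIG_PATH = "examples/auto_deploy/nano_v3_multi_device.yaml"

with open(CONFIG_PATH) as f:
    cfg = yaml.safe_load(f)

# CUDA graphs are captured only for the listed batch sizes, so none of them
# should exceed the configured max_batch_size.
assert max(cfg["cuda_graph_batch_sizes"]) <= cfg["max_batch_size"]

# The manual sharding plan in this config assumes an 8-way tensor-parallel
# process grid (tp=8, ep=1).
grid = cfg["transforms"]["detect_sharding"]["process_grid"]
print(f"process grid: tp={grid['tp']} ep={grid['ep']} -> {grid['tp'] * grid['ep']} ranks")

# tp_plan maps projection-module names to sharding strategies
# (colwise / rowwise / mamba in this config; the commented-out entries
# would add simple-shard "gather" for shared experts and latent projections).
tp_plan = cfg["transforms"]["detect_sharding"]["manual_config"]["tp_plan"]
for name, strategy in tp_plan.items():
    print(f"  {name}: {strategy}")

The checks simply mirror what the config's own comments state: CUDA graph capture sizes stay within max_batch_size, and the manual tp_plan plus the tp=8/ep=1 process grid defines how the Mamba, attention, and MoE projections are sharded.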