From 31314b9fedc186e9365ccdc5777abb3b0c62c8ed Mon Sep 17 00:00:00 2001
From: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Date: Thu, 12 Feb 2026 08:37:42 +0200
Subject: [PATCH] [None][chore] added AutoDeploy nano_v3_scale.yaml (#10845)

Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
---
 .../auto_deploy/nano_v3_multi_device.yaml | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 examples/auto_deploy/nano_v3_multi_device.yaml

diff --git a/examples/auto_deploy/nano_v3_multi_device.yaml b/examples/auto_deploy/nano_v3_multi_device.yaml
new file mode 100644
index 0000000000..b311c19017
--- /dev/null
+++ b/examples/auto_deploy/nano_v3_multi_device.yaml
@@ -0,0 +1,51 @@
+runtime: trtllm
+compile_backend: torch-cudagraph
+max_batch_size: 384
+max_seq_len: 65536 # tunable
+enable_chunked_prefill: true
+attn_backend: flashinfer
+model_factory: AutoModelForCausalLM
+skip_loading_weights: false
+sampler_type: "TRTLLMSampler"
+cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
+kv_cache_config:
+  free_gpu_memory_fraction: 0.88
+  # tunable mamba cache dtype
+  # --> use float32 for accuracy and default (auto) for speed
+  mamba_ssm_cache_dtype: auto
+transforms:
+  detect_sharding:
+    allreduce_strategy: SYMM_MEM
+    sharding_dims: ['tp','ep', 'bmm']
+    process_grid: {'tp': 8, 'ep': 1}
+    manual_config:
+      head_dim: 128
+      tp_plan:
+        # mamba SSM layer
+        "in_proj": "mamba"
+        "out_proj": "rowwise"
+        # attention layer
+        "q_proj": "colwise"
+        "k_proj": "colwise"
+        "v_proj": "colwise"
+        "o_proj": "rowwise"
+        # NOTE: consider not sharding shared experts and/or
+        # latent projections at all, keeping them replicated.
+        # To do so, comment out the corresponding entries.
+        # moe layer: SHARED experts
+        # "up_proj": "colwise"
+        # "down_proj": "rowwise"
+        # MoLE: latent projections: simple shard
+        # "fc1_latent_proj": "gather"
+        # "fc2_latent_proj": "gather"
+  multi_stream_moe:
+    stage: compile
+    enabled: false
+  gather_logits_before_lm_head:
+    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
+    enabled: true
+  fuse_mamba_a_log:
+    stage: post_load_fusion
+    enabled: true
+  insert_cached_ssm_attention:
+    backend: flashinfer_ssm
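
The sketch below is not part of the patch; it is a minimal way to sanity-check the new nano_v3_multi_device.yaml before launching a multi-GPU run. It assumes PyYAML is installed and that it is run from the repository root; the script name and the specific checks are illustrative, not an AutoDeploy API.

# sanity_check_nano_v3_config.py -- illustrative only, not part of this patch.
# Loads the new AutoDeploy config and checks a few internal consistency
# properties. Assumes PyYAML is installed and the script is run from the
# repository root.
import yaml

CONFIG_PATH = "examples/auto_deploy/nano_v3_multi_device.yaml"

with open(CONFIG_PATH) as f:
    cfg = yaml.safe_load(f)

# CUDA graphs are captured only for the listed batch sizes, so none of them
# should exceed the configured max_batch_size.
assert max(cfg["cuda_graph_batch_sizes"]) <= cfg["max_batch_size"]

# The manual sharding plan in this config assumes an 8-way tensor-parallel
# process grid (tp=8, ep=1).
grid = cfg["transforms"]["detect_sharding"]["process_grid"]
print(f"process grid: tp={grid['tp']} ep={grid['ep']} -> {grid['tp'] * grid['ep']} ranks")

# tp_plan maps projection-module names to sharding strategies
# (colwise / rowwise / mamba in this config; the commented-out entries
# would add simple-shard "gather" for shared experts and latent projections).
tp_plan = cfg["transforms"]["detect_sharding"]["manual_config"]["tp_plan"]
for name, strategy in tp_plan.items():
    print(f"  {name}: {strategy}")

The checks simply mirror what the config's own comments state: CUDA graph capture sizes stay within max_batch_size, and the manual tp_plan plus the tp=8/ep=1 process grid defines how the Mamba, attention, and MoE projections are sharded.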