From 38bcee189cf6b16983b912a192b63880029b3d81 Mon Sep 17 00:00:00 2001
From: Grzegorz Kwasniewski <213329731+greg-kwasniewski1@users.noreply.github.com>
Date: Wed, 28 Jan 2026 10:34:10 +0100
Subject: [PATCH] [TRTLLM-10362][feat] Added Mamba and MLA layers to the sharding tests (#10364)

Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com>
Signed-off-by: Grzegorz Kwasniewski <213329731+greg-kwasniewski1@users.noreply.github.com>
---
 jenkins/L0_MergeRequest.groovy                 |  2 ++
 .../defs/accuracy/test_llm_api_autodeploy.py   | 18 +++++++++++-------
 .../integration/test_lists/test-db/l0_b200.yml |  2 +-
 .../test_lists/test-db/l0_dgx_b200.yml         |  1 +
 .../integration/test_lists/test-db/l0_h100.yml |  4 ++--
 .../library/test_tp_sharding.py                |  6 +++---
 6 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index e8578c0b62..e2a7f78e31 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -719,6 +719,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tensorrt_llm/_torch/pyexecutor/_util.py",
         "tensorrt_llm/_torch/pyexecutor/model_engine.py",
         "tensorrt_llm/_torch/pyexecutor/py_executor.py",
+        "tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
         "tensorrt_llm/evaluate/json_mode_eval.py",
         "tensorrt_llm/evaluate/mmlu.py",
         "tensorrt_llm/executor/",
@@ -740,6 +741,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tests/integration/defs/accuracy/test_disaggregated_serving.py",
         "tests/unittest/_torch/ray_orchestrator/multi_gpu/",
         "tests/integration/defs/examples/test_ray.py",
+        "tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
         "tests/unittest/llmapi/test_async_llm.py",
         "docker/common/install_ucx.sh",
         "docker/common/install_nixl.sh",
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 9dd02a392e..d46820ce90 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -161,7 +161,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
     MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev"
     MODEL_PATH_NVFP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"
 
-    def get_default_kwargs(self):
+    def get_default_kwargs(self, world_size=1):
         return {
             "skip_tokenizer_init": False,
             "trust_remote_code": True,
@@ -189,7 +189,8 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
                 },
                 "multi_stream_moe": {
                     "stage": "compile",
-                    "enabled": True,
+                    # multi-stream MOE currently does not work for world_size > 1
+                    "enabled": world_size == 1,
                 },
             }
         }
@@ -203,14 +204,16 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
                              use_beam_search=beam_width > 1)
 
     @pytest.mark.skip_less_device_memory(32000)
-    def test_bf16(self):
-        kwargs = self.get_default_kwargs()
+    @pytest.mark.parametrize("world_size", [1, 4])
+    def test_bf16(self, world_size):
+        kwargs = self.get_default_kwargs(world_size=world_size)
         # TODO: multi-stream MOE seems to increase the memory usage
         kwargs["max_batch_size"] = 32
         kwargs["kv_cache_config"] = {"free_gpu_memory_fraction": 0.4}
         sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
+                           world_size=world_size,
                            **kwargs) as llm:
             sampling_params = self.get_default_sampling_params()
             task = MMLU(self.MODEL_NAME)
@@ -219,11 +222,12 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device_memory(32000)
-    def test_fp8(self):
-        kwargs = self.get_default_kwargs()
-        kwargs["max_batch_size"] = 64
+    @pytest.mark.parametrize("world_size", [1, 4])
+    def test_fp8(self, world_size):
+        kwargs = self.get_default_kwargs(world_size=world_size)
         with AutoDeployLLM(model=self.MODEL_PATH_FP8,
                            tokenizer=self.MODEL_PATH_FP8,
+                           world_size=world_size,
                            **kwargs) as llm:
             # Manually set quant_config for FP8 model to get the accuracy threshold
             llm.args.quant_config.quant_algo = QuantAlgo.FP8
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 1ccfa2e60d..55b3bdd982 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -174,5 +174,5 @@ l0_b200:
       backend: autodeploy
   tests:
  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
+ - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
  - unittest/_torch/auto_deploy/unit/singlegpu
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 4f7a1200cd..4fb2c17f73 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -215,6 +215,7 @@ l0_dgx_b200:
   tests:
  - unittest/_torch/auto_deploy/unit/multigpu
  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+ - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]
  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 605943fefe..bb498ddeeb 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -436,8 +436,8 @@ l0_h100:
  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
- - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+ - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
+ - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16[1]
  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
  - examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py
index 170d9e9b5c..b66a6a54eb 100644
--- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py
+++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py
@@ -548,7 +548,7 @@ def _run_pattern_detection_job(
                     fused_weight_dims=fused_weight_dims,
                 )
             )
-        if is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
+        elif is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
             expected_transformations.append(
                 WeightShardingInfo(
                     target_node=node.name,
@@ -560,7 +560,7 @@ def _run_pattern_detection_job(
                     fused_weight_dims=(num_features, 16 * num_heads, 16 * num_heads),
                 )
             )
-        if is_op(node, torch.ops.auto_deploy.torch_ssm):
+        elif is_op(node, torch.ops.auto_deploy.torch_ssm):
             expected_transformations.append(
                 WeightShardingInfo(
                     target_node=node.name,
@@ -572,7 +572,7 @@ def _run_pattern_detection_job(
                     fused_weight_dims=None,
                 )
             )
-        if len(node.args) > 1 and (
+        elif len(node.args) > 1 and (
             "norm_weight" in node.args[0].name or "a_log" in node.args[0].name
         ):
             expected_transformations.append(