[TRTLLM-10362][feat] Added Mamba and MLA layers to the sharding tests (#10364)

Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com>
Signed-off-by: Grzegorz Kwasniewski <213329731+greg-kwasniewski1@users.noreply.github.com>
This commit is contained in:
Grzegorz Kwasniewski 2026-01-28 10:34:10 +01:00 committed by GitHub
parent 3e17ee4e38
commit 38bcee189c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 20 additions and 13 deletions

View File

@@ -719,6 +719,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"tensorrt_llm/_torch/pyexecutor/_util.py",
"tensorrt_llm/_torch/pyexecutor/model_engine.py",
"tensorrt_llm/_torch/pyexecutor/py_executor.py",
"tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
"tensorrt_llm/evaluate/json_mode_eval.py",
"tensorrt_llm/evaluate/mmlu.py",
"tensorrt_llm/executor/",
@@ -740,6 +741,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"tests/integration/defs/accuracy/test_disaggregated_serving.py",
"tests/unittest/_torch/ray_orchestrator/multi_gpu/",
"tests/integration/defs/examples/test_ray.py",
"tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
"tests/unittest/llmapi/test_async_llm.py",
"docker/common/install_ucx.sh",
"docker/common/install_nixl.sh",

View File

@@ -161,7 +161,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev"
MODEL_PATH_NVFP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"
def get_default_kwargs(self):
def get_default_kwargs(self, world_size=1):
return {
"skip_tokenizer_init": False,
"trust_remote_code": True,
@@ -189,7 +189,8 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
},
"multi_stream_moe": {
"stage": "compile",
"enabled": True,
# multi-stream MOE currently does not work for world_size > 1
"enabled": world_size == 1,
},
}
}
@@ -203,14 +204,16 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
use_beam_search=beam_width > 1)
@pytest.mark.skip_less_device_memory(32000)
def test_bf16(self):
kwargs = self.get_default_kwargs()
@pytest.mark.parametrize("world_size", [1, 4])
def test_bf16(self, world_size):
kwargs = self.get_default_kwargs(world_size=world_size)
# TODO: multi-stream MOE seems to increase the memory usage
kwargs["max_batch_size"] = 32
kwargs["kv_cache_config"] = {"free_gpu_memory_fraction": 0.4}
sampling_params = self.get_default_sampling_params()
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
tokenizer=self.MODEL_PATH_BF16,
world_size=world_size,
**kwargs) as llm:
sampling_params = self.get_default_sampling_params()
task = MMLU(self.MODEL_NAME)
@@ -219,11 +222,12 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
task.evaluate(llm)
@pytest.mark.skip_less_device_memory(32000)
def test_fp8(self):
kwargs = self.get_default_kwargs()
kwargs["max_batch_size"] = 64
@pytest.mark.parametrize("world_size", [1, 4])
def test_fp8(self, world_size):
kwargs = self.get_default_kwargs(world_size=world_size)
with AutoDeployLLM(model=self.MODEL_PATH_FP8,
tokenizer=self.MODEL_PATH_FP8,
world_size=world_size,
**kwargs) as llm:
# Manually set quant_config for FP8 model to get the accuracy threshold
llm.args.quant_config.quant_algo = QuantAlgo.FP8

View File

@@ -174,5 +174,5 @@ l0_b200:
backend: autodeploy
tests:
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
- unittest/_torch/auto_deploy/unit/singlegpu

View File

@@ -215,6 +215,7 @@ l0_dgx_b200:
tests:
- unittest/_torch/auto_deploy/unit/multigpu
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]

View File

@@ -436,8 +436,8 @@ l0_h100:
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16[1]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
- examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate

View File

@@ -548,7 +548,7 @@ def _run_pattern_detection_job(
fused_weight_dims=fused_weight_dims,
)
)
if is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
elif is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
expected_transformations.append(
WeightShardingInfo(
target_node=node.name,
@@ -560,7 +560,7 @@ def _run_pattern_detection_job(
fused_weight_dims=(num_features, 16 * num_heads, 16 * num_heads),
)
)
if is_op(node, torch.ops.auto_deploy.torch_ssm):
elif is_op(node, torch.ops.auto_deploy.torch_ssm):
expected_transformations.append(
WeightShardingInfo(
target_node=node.name,
@@ -572,7 +572,7 @@ def _run_pattern_detection_job(
fused_weight_dims=None,
)
)
if len(node.args) > 1 and (
elif len(node.args) > 1 and (
"norm_weight" in node.args[0].name or "a_log" in node.args[0].name
):
expected_transformations.append(