[TRTLLM-10362][feat] Added Mamba and MLA layers to the sharding tests (#10364)

Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com>
Signed-off-by: Grzegorz Kwasniewski <213329731+greg-kwasniewski1@users.noreply.github.com>
This commit is contained in:
Grzegorz Kwasniewski 2026-01-28 10:34:10 +01:00 committed by GitHub
parent 3e17ee4e38
commit 38bcee189c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 20 additions and 13 deletions

View File

@@ -719,6 +719,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"tensorrt_llm/_torch/pyexecutor/_util.py",
"tensorrt_llm/_torch/pyexecutor/model_engine.py",
"tensorrt_llm/_torch/pyexecutor/py_executor.py",
"tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
"tensorrt_llm/evaluate/json_mode_eval.py",
"tensorrt_llm/evaluate/mmlu.py",
"tensorrt_llm/executor/",
@@ -740,6 +741,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"tests/integration/defs/accuracy/test_disaggregated_serving.py",
"tests/unittest/_torch/ray_orchestrator/multi_gpu/",
"tests/integration/defs/examples/test_ray.py",
"tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
"tests/unittest/llmapi/test_async_llm.py",
"docker/common/install_ucx.sh",
"docker/common/install_nixl.sh",

View File

@@ -161,7 +161,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev"
MODEL_PATH_NVFP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"
def get_default_kwargs(self):
def get_default_kwargs(self, world_size=1):
return {
"skip_tokenizer_init": False,
"trust_remote_code": True,
@@ -189,7 +189,8 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
},
"multi_stream_moe": {
"stage": "compile",
"enabled": True,
# multi-stream MOE currently does not work for world_size > 1
"enabled": world_size == 1,
},
}
}
@@ -203,14 +204,16 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
use_beam_search=beam_width > 1)
@pytest.mark.skip_less_device_memory(32000)
def test_bf16(self):
kwargs = self.get_default_kwargs()
@pytest.mark.parametrize("world_size", [1, 4])
def test_bf16(self, world_size):
kwargs = self.get_default_kwargs(world_size=world_size)
# TODO: multi-stream MOE seems to increase the memory usage
kwargs["max_batch_size"] = 32
kwargs["kv_cache_config"] = {"free_gpu_memory_fraction": 0.4}
sampling_params = self.get_default_sampling_params()
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
tokenizer=self.MODEL_PATH_BF16,
world_size=world_size,
**kwargs) as llm:
sampling_params = self.get_default_sampling_params()
task = MMLU(self.MODEL_NAME)
@@ -219,11 +222,12 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
task.evaluate(llm)
@pytest.mark.skip_less_device_memory(32000)
def test_fp8(self):
kwargs = self.get_default_kwargs()
kwargs["max_batch_size"] = 64
@pytest.mark.parametrize("world_size", [1, 4])
def test_fp8(self, world_size):
kwargs = self.get_default_kwargs(world_size=world_size)
with AutoDeployLLM(model=self.MODEL_PATH_FP8,
tokenizer=self.MODEL_PATH_FP8,
world_size=world_size,
**kwargs) as llm:
# Manually set quant_config for FP8 model to get the accuracy threshold
llm.args.quant_config.quant_algo = QuantAlgo.FP8

View File

@@ -174,5 +174,5 @@ l0_b200:
backend: autodeploy
tests:
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
- unittest/_torch/auto_deploy/unit/singlegpu

View File

@@ -215,6 +215,7 @@ l0_dgx_b200:
tests:
- unittest/_torch/auto_deploy/unit/multigpu
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]

View File

@@ -436,8 +436,8 @@ l0_h100:
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16[1]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
- examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate

View File

@@ -548,7 +548,7 @@ def _run_pattern_detection_job(
fused_weight_dims=fused_weight_dims,
)
)
if is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
elif is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
expected_transformations.append(
WeightShardingInfo(
target_node=node.name,
@@ -560,7 +560,7 @@ def _run_pattern_detection_job(
fused_weight_dims=(num_features, 16 * num_heads, 16 * num_heads),
)
)
if is_op(node, torch.ops.auto_deploy.torch_ssm):
elif is_op(node, torch.ops.auto_deploy.torch_ssm):
expected_transformations.append(
WeightShardingInfo(
target_node=node.name,
@@ -572,7 +572,7 @@ def _run_pattern_detection_job(
fused_weight_dims=None,
)
)
if len(node.args) > 1 and (
elif len(node.args) > 1 and (
"norm_weight" in node.args[0].name or "a_log" in node.args[0].name
):
expected_transformations.append(