Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-04 18:21:52 +08:00)
[TRTLLM-10362][feat] Added Mamba and MLA layers to the sharding tests (#10364)
Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com>
Signed-off-by: Grzegorz Kwasniewski <213329731+greg-kwasniewski1@users.noreply.github.com>
Parent: 3e17ee4e38
Commit: 38bcee189c
@@ -719,6 +719,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
     "tensorrt_llm/_torch/pyexecutor/_util.py",
     "tensorrt_llm/_torch/pyexecutor/model_engine.py",
     "tensorrt_llm/_torch/pyexecutor/py_executor.py",
+    "tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
     "tensorrt_llm/evaluate/json_mode_eval.py",
     "tensorrt_llm/evaluate/mmlu.py",
     "tensorrt_llm/executor/",
@@ -740,6 +741,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
     "tests/integration/defs/accuracy/test_disaggregated_serving.py",
     "tests/unittest/_torch/ray_orchestrator/multi_gpu/",
     "tests/integration/defs/examples/test_ray.py",
+    "tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
     "tests/unittest/llmapi/test_async_llm.py",
     "docker/common/install_ucx.sh",
     "docker/common/install_nixl.sh",
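Both hunks extend the file lists that gate the multi-GPU CI stage, so touching the sharding transform or the AutoDeploy accuracy test now triggers the multi-GPU pipeline. A minimal Python sketch of this kind of change gating; the real logic lives in the Groovy function above, and the function and constant names here are illustrative only:

# Illustrative sketch of prefix-based change gating, not the actual Groovy code.
MULTI_GPU_TRIGGER_PATHS = [
    "tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
    "tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
    "tensorrt_llm/executor/",  # a trailing slash matches the whole directory
]

def multi_gpu_file_changed(changed_files: list) -> bool:
    """Return True if any changed file should trigger the multi-GPU stage."""
    return any(
        f == path or (path.endswith("/") and f.startswith(path))
        for f in changed_files
        for path in MULTI_GPU_TRIGGER_PATHS
    )

assert multi_gpu_file_changed(["tensorrt_llm/executor/worker.py"])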
@@ -161,7 +161,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
     MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev"
     MODEL_PATH_NVFP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"

-    def get_default_kwargs(self):
+    def get_default_kwargs(self, world_size=1):
         return {
             "skip_tokenizer_init": False,
             "trust_remote_code": True,
@@ -189,7 +189,8 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
             },
             "multi_stream_moe": {
                 "stage": "compile",
-                "enabled": True,
+                # multi-stream MOE currently does not work for world_size > 1
+                "enabled": world_size == 1,
             },
         }
     }
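These two hunks thread a world_size argument through the default-kwargs helper so that multi-stream MOE, which currently does not work for world_size > 1, is only enabled on a single GPU. A condensed sketch of the resulting helper; the enclosing keys around "multi_stream_moe" and the remaining settings are assumptions, and only the world_size gating is taken from the diff:

# Condensed sketch; the real helper returns many more settings and the
# "transforms" key name is an assumption.
def get_default_kwargs(world_size=1):
    return {
        "skip_tokenizer_init": False,
        "trust_remote_code": True,
        "transforms": {
            "multi_stream_moe": {
                "stage": "compile",
                # multi-stream MOE currently does not work for world_size > 1
                "enabled": world_size == 1,
            },
        },
    }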
@@ -203,14 +204,16 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
                              use_beam_search=beam_width > 1)

     @pytest.mark.skip_less_device_memory(32000)
-    def test_bf16(self):
-        kwargs = self.get_default_kwargs()
+    @pytest.mark.parametrize("world_size", [1, 4])
+    def test_bf16(self, world_size):
+        kwargs = self.get_default_kwargs(world_size=world_size)
         # TODO: multi-stream MOE seems to increase the memory usage
         kwargs["max_batch_size"] = 32
         kwargs["kv_cache_config"] = {"free_gpu_memory_fraction": 0.4}
         sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
+                           world_size=world_size,
                            **kwargs) as llm:
             sampling_params = self.get_default_sampling_params()
             task = MMLU(self.MODEL_NAME)
@@ -219,11 +222,12 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
             task.evaluate(llm)

     @pytest.mark.skip_less_device_memory(32000)
-    def test_fp8(self):
-        kwargs = self.get_default_kwargs()
-        kwargs["max_batch_size"] = 64
+    @pytest.mark.parametrize("world_size", [1, 4])
+    def test_fp8(self, world_size):
+        kwargs = self.get_default_kwargs(world_size=world_size)
         with AutoDeployLLM(model=self.MODEL_PATH_FP8,
                            tokenizer=self.MODEL_PATH_FP8,
+                           world_size=world_size,
                            **kwargs) as llm:
             # Manually set quant_config for FP8 model to get the accuracy threshold
             llm.args.quant_config.quant_algo = QuantAlgo.FP8
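Parametrizing over world_size changes the collected test ids: pytest appends each parameter value in brackets, so these tests are now collected as test_bf16[1], test_bf16[4], test_fp8[1], and test_fp8[4], which is what the test-list updates below reference. A minimal self-contained illustration of the id scheme:

import pytest

@pytest.mark.parametrize("world_size", [1, 4])
def test_fp8(world_size):
    # Collected as test_fp8[1] and test_fp8[4]; a test list can then pin
    # the 1-GPU variant to single-GPU stages and the 4-GPU variant to
    # multi-GPU stages.
    assert world_size in (1, 4)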
@@ -174,5 +174,5 @@ l0_b200:
       backend: autodeploy
   tests:
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
   - unittest/_torch/auto_deploy/unit/singlegpu
@@ -215,6 +215,7 @@ l0_dgx_b200:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]
   - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
   - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
   - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
@@ -436,8 +436,8 @@ l0_h100:
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
   - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
   - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16[1]
   - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
   - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
   - examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
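Each entry in these lists is a pytest node id, and the bracketed suffix selects exactly one parametrized variant. For example, the 4-GPU variant alone could be invoked programmatically like this (the path is assumed to be relative to the integration-test defs directory):

import pytest

# Select exactly one parametrized variant by its full node id.
pytest.main(["accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]"])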
@@ -548,7 +548,7 @@ def _run_pattern_detection_job(
                     fused_weight_dims=fused_weight_dims,
                 )
             )
-        if is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
+        elif is_op(node, torch.ops.auto_deploy.torch_causal_conv1d):
             expected_transformations.append(
                 WeightShardingInfo(
                     target_node=node.name,
@@ -560,7 +560,7 @@ def _run_pattern_detection_job(
                     fused_weight_dims=(num_features, 16 * num_heads, 16 * num_heads),
                 )
             )
-        if is_op(node, torch.ops.auto_deploy.torch_ssm):
+        elif is_op(node, torch.ops.auto_deploy.torch_ssm):
             expected_transformations.append(
                 WeightShardingInfo(
                     target_node=node.name,
@@ -572,7 +572,7 @@ def _run_pattern_detection_job(
                     fused_weight_dims=None,
                 )
             )
-        if len(node.args) > 1 and (
+        elif len(node.args) > 1 and (
             "norm_weight" in node.args[0].name or "a_log" in node.args[0].name
         ):
             expected_transformations.append(
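The three if-to-elif changes make the op classification in _run_pattern_detection_job mutually exclusive, so each node contributes at most one expected transformation: a node that matched an earlier op branch can no longer fall through into the name-based fallback and be appended twice. A stripped-down sketch of the dispatch shape; the op and argument names echo the diff, everything else is illustrative:

# Illustrative sketch of the mutually exclusive dispatch after the fix.
def classify(node_op: str, first_arg_name: str = ""):
    if node_op == "torch_causal_conv1d":
        return "conv1d_sharding"
    elif node_op == "torch_ssm":
        return "ssm_sharding"
    elif "norm_weight" in first_arg_name or "a_log" in first_arg_name:
        # With a plain `if`, a torch_ssm node whose first argument is
        # named "a_log..." would also reach this branch and produce a
        # second, spurious transformation.
        return "elementwise_sharding"
    return None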