From 2e61526d12bb82ff94341b006361f420c6074382 Mon Sep 17 00:00:00 2001
From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Date: Wed, 10 Sep 2025 10:34:18 +0800
Subject: [PATCH] Re-enable CUDA12 build and Llama-4 AutoDeploy test config;
 clone inputs_embeds in mistral3 patch

- Re-enable the "Build TRT-LLM CUDA12" stage and drop the stale comment
  that described it as disabled.
- Re-enable the Llama-4-Scout AutoDeploy test config; the case is now
  skipped at runtime in test_build_ad until
  https://nvbugspro.nvidia.com/bug/5505835 is resolved.
- Return inputs_embeds.clone() from the patched mistral3 forward so the
  output no longer aliases the input tensor.

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
---
 jenkins/Build.groovy                              |  5 ++---
 .../_torch/auto_deploy/models/patches/mistral3.py |  2 +-
 .../unit/singlegpu/test_ad_build_small_single.py  | 14 ++++++++------
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 4c27ddb115..870792504b 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -573,8 +573,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
         "Build TRT-LLM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
-        // Disable CUDA12 build for too slow to build (cost > 5 hours on SBSA)
-        // "Build TRT-LLM CUDA12": [LLM_DOCKER_IMAGE_CU12] + prepareLLMBuild(
-        //     pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_CU12 : CONFIG_LINUX_X86_64_VANILLA_CU12),
+        "Build TRT-LLM CUDA12": [LLM_DOCKER_IMAGE_CU12] + prepareLLMBuild(
+            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_CU12 : CONFIG_LINUX_X86_64_VANILLA_CU12),
         "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
         "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py b/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py
index 44a21770fd..5b61b5a417 100644
--- a/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py
+++ b/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py
@@ -91,7 +91,7 @@ def _mistral_forward(
     pixel_values: torch.Tensor,
     image_sizes: Optional[torch.Tensor],
 ):
-    return inputs_embeds
+    return inputs_embeds.clone()
 
 def _vision_branch(
     # ! The type annotations in the original transformers code are all wrong.
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py
index 22e4b047b6..22af97d219 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py
@@ -65,11 +65,11 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs):
             compile_backend="torch-simple",
         ),
-        # disabled due to https://nvbugspro.nvidia.com/bug/5505835
-        # get_small_model_config(
-        #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        #     attn_backend="flashinfer",
-        #     compile_backend="torch-simple",
-        # ),
+        # skipped in test_build_ad due to https://nvbugspro.nvidia.com/bug/5505835
+        get_small_model_config(
+            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            attn_backend="flashinfer",
+            compile_backend="torch-simple",
+        ),
         get_small_model_config(
             "deepseek-ai/DeepSeek-V3",
             attn_backend="triton",
@@ -97,6 +97,8 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs):
     ],
 )
 def test_build_ad(experiment_config: Dict):
+    if "Llama-4" in experiment_config["args"]["model"]:
+        pytest.skip("https://nvbugspro.nvidia.com/bug/5505835")
     experiment_config["args"]["runtime"] = "demollm"  # Default runtime set to demollm
     experiment_config["args"]["world_size"] = 0  # Default world_size set to 0
     experiment_config = ExperimentConfig(**experiment_config)
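
Why the mistral3 change returns inputs_embeds.clone(): returning an input
tensor unmodified makes the function's output share storage with (alias)
that input, and graph-capture pipelines such as torch.export commonly
reject or special-case aliased outputs. The sketch below demonstrates the
aliasing behavior itself; the function names are illustrative, not taken
from the patched file:

    import torch

    def forward_aliased(inputs_embeds: torch.Tensor) -> torch.Tensor:
        # Returns the same tensor object: the output shares storage
        # with the input.
        return inputs_embeds

    def forward_cloned(inputs_embeds: torch.Tensor) -> torch.Tensor:
        # Returns an independent copy: in-place edits to the input no
        # longer show up in the output (and vice versa).
        return inputs_embeds.clone()

    x = torch.zeros(2, 3)
    aliased = forward_aliased(x)
    cloned = forward_cloned(x)
    x.add_(1.0)                   # in-place update of the input
    print(aliased[0, 0].item())   # 1.0 -- aliased output changed too
    print(cloned[0, 0].item())    # 0.0 -- cloned output is unaffected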
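
The test change follows the skip-at-runtime pattern: the Llama-4 config
stays in the parametrization, and test_build_ad skips it with the bug link
as the reason, so the case is reported as SKIPPED rather than silently
vanishing from test reports. A minimal standalone sketch of the pattern,
with hypothetical configs in place of get_small_model_config():

    import pytest

    CONFIGS = [
        {"args": {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"}},
        {"args": {"model": "deepseek-ai/DeepSeek-V3"}},
    ]

    @pytest.mark.parametrize("experiment_config", CONFIGS)
    def test_build_ad(experiment_config):
        # Skip the known-broken model at runtime; pytest shows the reason
        # string next to the SKIPPED outcome in the report.
        if "Llama-4" in experiment_config["args"]["model"]:
            pytest.skip("https://nvbugspro.nvidia.com/bug/5505835")
        assert experiment_config["args"]["model"]

An equivalent alternative is wrapping the one entry in
pytest.param(..., marks=pytest.mark.skip(reason=...)), which keeps the
skip visible in the parametrization itself instead of in the test body.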