From 2e61526d12bb82ff94341b006361f420c6074382 Mon Sep 17 00:00:00 2001
From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Date: Wed, 10 Sep 2025 10:34:18 +0800
Subject: [PATCH] Re-enable CUDA12 build and Llama-4 AutoDeploy test config;
 clone inputs_embeds in mistral3 patch

- Re-enable the "Build TRT-LLM CUDA12" stage and drop the stale comment
  that described it as disabled.
- Re-enable the Llama-4-Scout AutoDeploy test config; the case is now
  skipped at runtime in test_build_ad until
  https://nvbugspro.nvidia.com/bug/5505835 is resolved.
- Return inputs_embeds.clone() from the patched mistral3 forward so the
  output no longer aliases the input tensor.

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
---
 jenkins/Build.groovy                              |  5 ++---
 .../_torch/auto_deploy/models/patches/mistral3.py |  2 +-
 .../unit/singlegpu/test_ad_build_small_single.py  | 14 ++++++++------
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 4c27ddb115..870792504b 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -573,8 +573,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
         "Build TRT-LLM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
-        // Disable CUDA12 build for too slow to build (cost > 5 hours on SBSA)
-        // "Build TRT-LLM CUDA12": [LLM_DOCKER_IMAGE_CU12] + prepareLLMBuild(
-        //     pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_CU12 : CONFIG_LINUX_X86_64_VANILLA_CU12),
+        "Build TRT-LLM CUDA12": [LLM_DOCKER_IMAGE_CU12] + prepareLLMBuild(
+            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_CU12 : CONFIG_LINUX_X86_64_VANILLA_CU12),
         "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
         "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py b/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py
index 44a21770fd..5b61b5a417 100644
--- a/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py
+++ b/tensorrt_llm/_torch/auto_deploy/models/patches/mistral3.py
@@ -91,7 +91,7 @@ def _mistral_forward(
     pixel_values: torch.Tensor,
     image_sizes: Optional[torch.Tensor],
 ):
-    return inputs_embeds
+    return inputs_embeds.clone()
 
 def _vision_branch(
     # ! The type annotations in the original transformers code are all wrong.
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py
index 22e4b047b6..22af97d219 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py
@@ -65,11 +65,11 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs):
             compile_backend="torch-simple",
         ),
-        # disabled due to https://nvbugspro.nvidia.com/bug/5505835
-        # get_small_model_config(
-        #     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        #     attn_backend="flashinfer",
-        #     compile_backend="torch-simple",
-        # ),
+        # skipped in test_build_ad due to https://nvbugspro.nvidia.com/bug/5505835
+        get_small_model_config(
+            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            attn_backend="flashinfer",
+            compile_backend="torch-simple",
+        ),
         get_small_model_config(
             "deepseek-ai/DeepSeek-V3",
             attn_backend="triton",
@@ -97,6 +97,8 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs):
     ],
 )
 def test_build_ad(experiment_config: Dict):
+    if "Llama-4" in experiment_config["args"]["model"]:
+        pytest.skip("https://nvbugspro.nvidia.com/bug/5505835")
     experiment_config["args"]["runtime"] = "demollm"  # Default runtime set to demollm
     experiment_config["args"]["world_size"] = 0  # Default world_size set to 0
     experiment_config = ExperimentConfig(**experiment_config)
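
Why the mistral3 change returns inputs_embeds.clone(): returning an input
tensor unmodified makes the function's output share storage with (alias)
that input, and graph-capture pipelines such as torch.export commonly
reject or special-case aliased outputs. The sketch below demonstrates the
aliasing behavior itself; the function names are illustrative, not taken
from the patched file:

    import torch

    def forward_aliased(inputs_embeds: torch.Tensor) -> torch.Tensor:
        # Returns the same tensor object: the output shares storage
        # with the input.
        return inputs_embeds

    def forward_cloned(inputs_embeds: torch.Tensor) -> torch.Tensor:
        # Returns an independent copy: in-place edits to the input no
        # longer show up in the output (and vice versa).
        return inputs_embeds.clone()

    x = torch.zeros(2, 3)
    aliased = forward_aliased(x)
    cloned = forward_cloned(x)
    x.add_(1.0)                   # in-place update of the input
    print(aliased[0, 0].item())   # 1.0 -- aliased output changed too
    print(cloned[0, 0].item())    # 0.0 -- cloned output is unaffected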
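
The test change follows the skip-at-runtime pattern: the Llama-4 config
stays in the parametrization, and test_build_ad skips it with the bug link
as the reason, so the case is reported as SKIPPED rather than silently
vanishing from test reports. A minimal standalone sketch of the pattern,
with hypothetical configs in place of get_small_model_config():

    import pytest

    CONFIGS = [
        {"args": {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct"}},
        {"args": {"model": "deepseek-ai/DeepSeek-V3"}},
    ]

    @pytest.mark.parametrize("experiment_config", CONFIGS)
    def test_build_ad(experiment_config):
        # Skip the known-broken model at runtime; pytest shows the reason
        # string next to the SKIPPED outcome in the report.
        if "Llama-4" in experiment_config["args"]["model"]:
            pytest.skip("https://nvbugspro.nvidia.com/bug/5505835")
        assert experiment_config["args"]["model"]

An equivalent alternative is wrapping the one entry in
pytest.param(..., marks=pytest.mark.skip(reason=...)), which keeps the
skip visible in the parametrization itself instead of in the test body.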