Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00

[TRTLLM-7261][feat] Support phi-4 model in pytorch backend (#7371)
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
parent 572551b586
commit 4223a9aada
@@ -26,6 +26,7 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA
 | `MllamaForConditionalGeneration` | Llama 3.2 | `meta-llama/Llama-3.2-11B-Vision` | L |
 | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base` | L |
 | `NemotronNASForCausalLM` | NemotronNAS | `nvidia/Llama-3_3-Nemotron-Super-49B-v1` | L |
+| `Phi3ForCausalLM` | Phi-4 | `microsoft/Phi-4` | L |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | `microsoft/Phi-4-multimodal-instruct` | L + I + A |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/Qwen2-7B-Instruct` | L |
 | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B` | L |
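As a usage note for the new table row: below is a minimal sketch, assuming the standard LLM API entry point (`tensorrt_llm.LLM`), of driving the newly supported `microsoft/Phi-4` checkpoint on the PyTorch backend. The prompt and sampling settings are illustrative and not part of this commit.

# Illustrative sketch (not from this commit): generate with Phi-4 via the LLM API.
from tensorrt_llm import LLM, SamplingParams

def main():
    # Accepts a HuggingFace model ID or a local checkpoint path.
    llm = LLM(model="microsoft/Phi-4")
    params = SamplingParams(max_tokens=64, temperature=0.0)
    for output in llm.generate(["Explain the KV cache in one sentence."], params):
        print(output.outputs[0].text)

if __name__ == "__main__":
    main()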
@@ -189,6 +189,11 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
+microsoft/phi-4:
+  - accuracy: 90.30
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.64
 mistralai/Codestral-22B-v0.1:
   - accuracy: 67.10
 GPT-OSS/BF16:
@@ -293,6 +293,11 @@ microsoft/Phi-4-multimodal-instruct:
   - accuracy: 69.69
 microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 65.98
+microsoft/phi-4:
+  - accuracy: 79.73
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 79.36
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 78.52
 GPT-OSS/BF16:
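Both reference hunks above follow the same YAML schema: each model key maps to a list of entries, where a bare `accuracy` entry is the unquantized baseline and an entry carrying `quant_algo` / `kv_cache_quant_algo` pins the threshold for that quantized configuration. A hypothetical sketch of how a harness could resolve a threshold from such a file (the helper name and file path are invented for illustration; the repo's actual lookup may differ):

# Hypothetical helper, for illustration only: resolve the reference accuracy
# for a (model, quantization) pair from an accuracy-references YAML file.
import yaml

def lookup_threshold(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        references = yaml.safe_load(f)
    # Entries without quant keys match the default (unquantized) run.
    for entry in references[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no entry for {model!r} with quant_algo={quant_algo!r}")

# Given the second hunk above, the FP8 entry resolves to 79.36:
# lookup_threshold("references.yaml", "microsoft/phi-4", "FP8", "FP8")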
@@ -2791,6 +2791,25 @@ class TestBielik11BInstruct(LlmapiAccuracyTestHarness):
         task.evaluate(llm)
 
 
+class TestPhi4(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "microsoft/phi-4"
+
+    def test_auto_dtype(self):
+        with LLM(f"{llm_models_root()}/Phi-4") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    def test_fp8(self):
+        with LLM(f"{llm_models_root()}/Phi-4-FP8") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestPhi4MM(LlmapiAccuracyTestHarness):
     # phi4-mm can also support text input.
     MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
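The two tests added above are the same node IDs registered in the test lists below. To run only them, standard pytest node-ID selection applies; a sketch, assuming the working directory that contains the `accuracy/` test package:

# Illustrative: run the two new Phi-4 accuracy tests by pytest node ID.
import pytest

raise SystemExit(pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype",
    "accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8",
    "-v",
]))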
@@ -603,6 +603,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
@@ -133,6 +133,8 @@ accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]