Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00

[TRTLLM-7261][feat] Support phi-4 model in pytorch backend (#7371)
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
parent 572551b586
commit 4223a9aada
@@ -26,6 +26,7 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA
 | `MllamaForConditionalGeneration` | Llama 3.2 | `meta-llama/Llama-3.2-11B-Vision` | L |
 | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base` | L |
 | `NemotronNASForCausalLM` | NemotronNAS | `nvidia/Llama-3_3-Nemotron-Super-49B-v1` | L |
+| `Phi3ForCausalLM` | Phi-4 | `microsoft/Phi-4` | L |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | `microsoft/Phi-4-multimodal-instruct` | L + I + A |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/Qwen2-7B-Instruct` | L |
 | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B` | L |
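As a usage note for the new table row: below is a minimal sketch, assuming the standard LLM API entry point (`tensorrt_llm.LLM`), of driving the newly supported `microsoft/Phi-4` checkpoint on the PyTorch backend. The prompt and sampling settings are illustrative and not part of this commit.

# Illustrative sketch (not from this commit): generate with Phi-4 via the LLM API.
from tensorrt_llm import LLM, SamplingParams

def main():
    # Accepts a HuggingFace model ID or a local checkpoint path.
    llm = LLM(model="microsoft/Phi-4")
    params = SamplingParams(max_tokens=64, temperature=0.0)
    for output in llm.generate(["Explain the KV cache in one sentence."], params):
        print(output.outputs[0].text)

if __name__ == "__main__":
    main()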
@@ -189,6 +189,11 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
+microsoft/phi-4:
+  - accuracy: 90.30
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.64
 mistralai/Codestral-22B-v0.1:
   - accuracy: 67.10
 GPT-OSS/BF16:
@@ -293,6 +293,11 @@ microsoft/Phi-4-multimodal-instruct:
   - accuracy: 69.69
 microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 65.98
+microsoft/phi-4:
+  - accuracy: 79.73
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 79.36
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 78.52
 GPT-OSS/BF16:
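Both reference hunks above follow the same YAML schema: each model key maps to a list of entries, where a bare `accuracy` entry is the unquantized baseline and an entry carrying `quant_algo` / `kv_cache_quant_algo` pins the threshold for that quantized configuration. A hypothetical sketch of how a harness could resolve a threshold from such a file (the helper name and file path are invented for illustration; the repo's actual lookup may differ):

# Hypothetical helper, for illustration only: resolve the reference accuracy
# for a (model, quantization) pair from an accuracy-references YAML file.
import yaml

def lookup_threshold(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        references = yaml.safe_load(f)
    # Entries without quant keys match the default (unquantized) run.
    for entry in references[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no entry for {model!r} with quant_algo={quant_algo!r}")

# Given the second hunk above, the FP8 entry resolves to 79.36:
# lookup_threshold("references.yaml", "microsoft/phi-4", "FP8", "FP8")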
@@ -2791,6 +2791,25 @@ class TestBielik11BInstruct(LlmapiAccuracyTestHarness):
         task.evaluate(llm)
 
 
+class TestPhi4(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "microsoft/phi-4"
+
+    def test_auto_dtype(self):
+        with LLM(f"{llm_models_root()}/Phi-4") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    def test_fp8(self):
+        with LLM(f"{llm_models_root()}/Phi-4-FP8") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestPhi4MM(LlmapiAccuracyTestHarness):
     # phi4-mm can also support text input.
     MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
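The two tests added above are the same node IDs registered in the test lists below. To run only them, standard pytest node-ID selection applies; a sketch, assuming the working directory that contains the `accuracy/` test package:

# Illustrative: run the two new Phi-4 accuracy tests by pytest node ID.
import pytest

raise SystemExit(pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype",
    "accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8",
    "-v",
]))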
@@ -603,6 +603,8 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
@@ -133,6 +133,8 @@ accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]