mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[TRTLLM-6541][test] Add NIM Related Cases Part 1 (#6684)
Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
This commit is contained in:
parent
daa2a65d37
commit
bff5fdf6df
@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct:
|
||||
- accuracy: 31.354
|
||||
microsoft/Phi-4-mini-instruct:
|
||||
- accuracy: 32.921
|
||||
- quant_algo: FP8
|
||||
accuracy: 32.823
|
||||
bigcode/starcoder2-7b:
|
||||
- accuracy: 26.611
|
||||
- quant_algo: FP8
|
||||
@ -132,6 +134,8 @@ meta-llama/Llama-3.1-8B-Instruct:
|
||||
- accuracy: 33.640
|
||||
- spec_dec_algo: Eagle
|
||||
accuracy: 33.640
|
||||
- extra_acc_spec: logprobs=2
|
||||
accuracy: 30.522
|
||||
- quant_algo: FP8
|
||||
accuracy: 33.841
|
||||
- quant_algo: FP8
|
||||
@ -207,7 +211,8 @@ mistralai/Mistral-7B-Instruct-v0.3:
|
||||
accuracy: 31.201
|
||||
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
|
||||
- accuracy: 29.20
|
||||
mistralai/Mistral-Nemo-Base-2407:
|
||||
mistralai/Mistral-Nemo-12b-Base:
|
||||
- accuracy: 28.906
|
||||
- quant_algo: FP8
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 24.0
|
||||
|
||||
@ -232,6 +232,8 @@ nvidia/Nemotron-H-56B-Base-8K:
|
||||
accuracy: 83.82
|
||||
microsoft/Phi-4-mini-instruct:
|
||||
- accuracy: 68.98
|
||||
- quant_algo: FP8
|
||||
accuracy: 68.30
|
||||
bigcode/starcoder2-7b:
|
||||
- accuracy: 41.35
|
||||
- quant_algo: FP8
|
||||
@ -275,3 +277,7 @@ GPT-OSS/MXFP4:
|
||||
accuracy: 75.50
|
||||
- quant_algo: W4A8_MXFP4_FP8
|
||||
accuracy: 75.50
|
||||
mistralai/Mistral-Nemo-12b-Base:
|
||||
- accuracy: 69.66
|
||||
- quant_algo: FP8
|
||||
accuracy: 69.66
|
||||
|
||||
@ -15,7 +15,9 @@
|
||||
import pytest
|
||||
|
||||
from tensorrt_llm._tensorrt_engine import LLM
|
||||
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
|
||||
from tensorrt_llm.llmapi import (EagleDecodingConfig,
|
||||
ExtendedRuntimePerfKnobConfig, KvCacheConfig,
|
||||
SamplingParams)
|
||||
from tensorrt_llm.models.modeling_utils import QuantConfig
|
||||
from tensorrt_llm.quantization import QuantAlgo
|
||||
|
||||
@ -76,6 +78,27 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
|
||||
task = JsonModeEval(self.MODEL_NAME)
|
||||
task.evaluate(llm)
|
||||
|
||||
def test_gather_generation_logits_cuda_graph(self):
    """RCCA: https://nvbugs/5365525

    Run the CnnDailymail evaluation on an LLM built with
    ``gather_generation_logits=True`` and an extended runtime perf-knob
    config that enables CUDA graph mode with a cache size of 1.
    """
    perf_knobs = ExtendedRuntimePerfKnobConfig(cuda_graph_mode=True,
                                               cuda_graph_cache_size=1)
    with LLM(self.MODEL_PATH,
             gather_generation_logits=True,
             extended_runtime_perf_knob_config=perf_knobs) as llm:
        CnnDailymail(self.MODEL_NAME).evaluate(llm)
|
||||
|
||||
def test_logprobs(self):
    """Evaluate CnnDailymail with ``SamplingParams(logprobs=2)``.

    The LLM is built with ``gather_generation_logits=True`` and the
    evaluation is tagged with the ``"logprobs=2"`` extra accuracy spec.
    """
    params = SamplingParams(logprobs=2)
    with LLM(self.MODEL_PATH, gather_generation_logits=True) as llm:
        CnnDailymail(self.MODEL_NAME).evaluate(llm,
                                               sampling_params=params,
                                               extra_acc_spec="logprobs=2")
|
||||
|
||||
|
||||
class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B"
|
||||
@ -177,18 +200,49 @@ class TestMistral7B_0_3(LlmapiAccuracyTestHarness):
|
||||
task.evaluate(llm)
|
||||
|
||||
|
||||
class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness):
|
||||
MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407"
|
||||
class TestMistralNemo12B(LlmapiAccuracyTestHarness):
|
||||
MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
|
||||
MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"
|
||||
|
||||
@pytest.mark.skip_less_device_memory(80000)
def test_auto_dtype(self):
    """Evaluate CnnDailymail and then MMLU on the default-dtype model.

    The LLM is created with a KV cache config using
    ``free_gpu_memory_fraction=0.9`` and ``max_batch_size=8``.
    """
    cache_cfg = KvCacheConfig(free_gpu_memory_fraction=0.9)
    llm_kwargs = dict(kv_cache_config=cache_cfg, max_batch_size=8)

    with LLM(self.MODEL_PATH, **llm_kwargs) as llm:
        # Same order as before: summarization first, then MMLU.
        for task_cls in (CnnDailymail, MMLU):
            task_cls(self.MODEL_NAME).evaluate(llm)
|
||||
|
||||
@pytest.mark.skip_less_device(2)
def test_auto_dtype_tp2(self):
    """Evaluate CnnDailymail and MMLU with ``tensor_parallel_size=2``.

    The LLM is created with a KV cache config using
    ``free_gpu_memory_fraction=0.9`` and ``max_batch_size=8``.

    The ``skip_less_device(2)`` marker mirrors the one on the PyTorch-flow
    variant of this test, so the case is skipped rather than run on hosts
    with fewer than two devices.
    """
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

    with LLM(self.MODEL_PATH,
             kv_cache_config=kv_cache_config,
             tensor_parallel_size=2,
             max_batch_size=8) as llm:
        task = CnnDailymail(self.MODEL_NAME)
        task.evaluate(llm)
        task = MMLU(self.MODEL_NAME)
        task.evaluate(llm)
|
||||
|
||||
@pytest.mark.skip_less_device_memory(80000)
|
||||
@skip_pre_ada
|
||||
def test_fp8(self):
|
||||
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
|
||||
quant_config = QuantConfig(QuantAlgo.FP8,
|
||||
kv_cache_quant_algo=QuantAlgo.FP8)
|
||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
|
||||
|
||||
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
|
||||
with LLM(self.MODEL_PATH,
|
||||
quant_config=quant_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
max_batch_size=8) as llm:
|
||||
task = CnnDailymail(self.MODEL_NAME)
|
||||
task.evaluate(llm)
|
||||
task = MMLU(self.MODEL_NAME)
|
||||
task.evaluate(llm)
|
||||
|
||||
|
||||
class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness):
|
||||
@ -244,6 +298,27 @@ class TestMixtral8x7BInstruct(LlmapiAccuracyTestHarness):
|
||||
task.evaluate(llm)
|
||||
|
||||
|
||||
class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
    """Accuracy cases for ``microsoft/Phi-4-mini-instruct``.

    Each case runs the CnnDailymail evaluation followed by MMLU.
    """
    MODEL_NAME = "microsoft/Phi-4-mini-instruct"
    MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"

    def test_auto_dtype(self):
        """Evaluate the model built with default LLM arguments."""
        with LLM(self.MODEL_PATH) as llm:
            for task_cls in (CnnDailymail, MMLU):
                task_cls(self.MODEL_NAME).evaluate(llm)

    @skip_pre_ada
    def test_fp8(self):
        """Evaluate the model built with an FP8 ``QuantConfig``."""
        fp8_config = QuantConfig(QuantAlgo.FP8)
        with LLM(self.MODEL_PATH, quant_config=fp8_config) as llm:
            for task_cls in (CnnDailymail, MMLU):
                task_cls(self.MODEL_NAME).evaluate(llm)
|
||||
|
||||
|
||||
class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
|
||||
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
|
||||
MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
|
||||
@ -378,7 +453,7 @@ class TestQwen2_5_7BInstruct(LlmapiAccuracyTestHarness):
|
||||
@skip_pre_ada
|
||||
def test_fp8_kvcache(self):
|
||||
"RCCA: https://nvbugs/5065080"
|
||||
quant_config = QuantConfig(QuantAlgo.FP8,
|
||||
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
|
||||
kv_cache_quant_algo=QuantAlgo.FP8)
|
||||
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
|
||||
task = CnnDailymail(self.MODEL_NAME)
|
||||
|
||||
@ -1773,6 +1773,36 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
|
||||
task.evaluate(llm)
|
||||
|
||||
|
||||
class TestMistralNemo12B(LlmapiAccuracyTestHarness):
    """Accuracy cases reported under ``mistralai/Mistral-Nemo-12b-Base``.

    The checkpoint is loaded from the ``Mistral-Nemo-Base-2407`` directory
    under the models root. Each case runs CnnDailymail followed by MMLU.
    """
    MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
    MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"

    @pytest.mark.skip_less_device_memory(80000)
    def test_auto_dtype(self):
        """Evaluate with ``max_batch_size=8`` and a 0.9 KV-cache fraction."""
        cache_cfg = KvCacheConfig(free_gpu_memory_fraction=0.9)
        llm_kwargs = dict(kv_cache_config=cache_cfg, max_batch_size=8)

        with LLM(self.MODEL_PATH, **llm_kwargs) as llm:
            for task_cls in (CnnDailymail, MMLU):
                task_cls(self.MODEL_NAME).evaluate(llm)

    @pytest.mark.skip_less_device(2)
    def test_auto_dtype_tp2(self):
        """Same evaluation as above, with ``tensor_parallel_size=2``."""
        cache_cfg = KvCacheConfig(free_gpu_memory_fraction=0.9)
        llm_kwargs = dict(kv_cache_config=cache_cfg,
                          tensor_parallel_size=2,
                          max_batch_size=8)

        with LLM(self.MODEL_PATH, **llm_kwargs) as llm:
            for task_cls in (CnnDailymail, MMLU):
                task_cls(self.MODEL_NAME).evaluate(llm)
|
||||
|
||||
|
||||
@pytest.mark.timeout(5400)
|
||||
@pytest.mark.skip_less_device_memory(80000)
|
||||
class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
|
||||
|
||||
@ -420,6 +420,10 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
|
||||
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
|
||||
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
|
||||
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
|
||||
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
|
||||
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
|
||||
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_auto_dtype
|
||||
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8
|
||||
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
|
||||
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
|
||||
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
|
||||
@ -431,7 +435,9 @@ accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
|
||||
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4]
|
||||
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq]
|
||||
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq]
|
||||
accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
|
||||
accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype
|
||||
accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype_tp2
|
||||
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
|
||||
accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
|
||||
accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
|
||||
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
|
||||
@ -576,6 +582,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
|
||||
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
|
||||
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
|
||||
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
|
||||
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
|
||||
|
||||
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
|
||||
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
|
||||
|
||||
@ -253,7 +253,7 @@ l0_h100:
|
||||
- examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
|
||||
- examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] # 5 mins
|
||||
- accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
|
||||
- accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
|
||||
- accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
|
||||
- examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
|
||||
- examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
|
||||
- examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
|
||||
|
||||
@ -300,7 +300,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
|
||||
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
|
||||
accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
|
||||
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
|
||||
triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
|
||||
triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
|
||||
triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
|
||||
@ -314,3 +314,5 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
|
||||
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)
|
||||
llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
|
||||
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user