Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[TRTLLM-6541][test] Add NIM Related Cases [StarCoder2_7B] and [Codestral_22B_V01] (#6939)
Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
parent 816a120af6
commit e90280a84d

@@ -45,6 +45,14 @@ microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
 microsoft/Phi-4-mini-instruct:
   - accuracy: 32.921
+bigcode/starcoder2-7b:
+  - accuracy: 26.611
+  - quant_algo: FP8
+    accuracy: 26.611
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 30.316
+  - quant_algo: FP8
+    accuracy: 30.316
 state-spaces/mamba-130m-hf:
   - accuracy: 19.470
 lmsys/vicuna-7b-v1.3:

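Note on the reference-file format used by the three YAML hunks in this commit: each top-level key is a Hugging Face model ID, and each list item records a reference accuracy; an item that carries a quant_algo key applies to runs quantized with that algorithm, while an item with accuracy alone covers the unquantized (auto-dtype) run. A minimal sketch of resolving such an entry with PyYAML follows; the lookup_reference helper is hypothetical and is not the harness's own lookup code.

# Minimal sketch (assumption: not the harness's actual lookup logic).
import yaml

REFERENCES = yaml.safe_load("""
bigcode/starcoder2-7b:
  - accuracy: 26.611
  - quant_algo: FP8
    accuracy: 26.611
""")

def lookup_reference(model, quant_algo=None):
    # Hypothetical helper: pick the entry whose quant_algo matches;
    # entries without a quant_algo key cover the unquantized run.
    for entry in REFERENCES[model]:
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} / {quant_algo}")

assert lookup_reference("bigcode/starcoder2-7b") == 26.611
assert lookup_reference("bigcode/starcoder2-7b", "FP8") == 26.611

Here the FP8 references deliberately match the unquantized ones, i.e. the commit asserts that FP8 quantization costs these two models no measurable accuracy on these tasks.
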
@@ -162,6 +162,8 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 67.10
 GPT-OSS/BF16:
   - accuracy: 90.3
 GPT-OSS/MXFP4:

@@ -232,6 +232,14 @@ nvidia/Nemotron-H-56B-Base-8K:
     accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
+bigcode/starcoder2-7b:
+  - accuracy: 41.35
+  - quant_algo: FP8
+    accuracy: 41.35
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 61.72
+  - quant_algo: FP8
+    accuracy: 61.72
 # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
 # TODO: update once https://nvbugs/5393849 is fixed.
 microsoft/Phi-4-mini-instruct-tp2:

@@ -433,3 +433,55 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness):
                  speculative_config=self.speculative_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-7b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    @pytest.mark.skip_less_device_memory(70000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device_memory(70000)
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Codestral-22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)

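Both new test classes exercise on-the-fly FP8 post-training quantization through the LLM API rather than loading a prequantized checkpoint. Below is a standalone sketch of the same pattern outside the harness; the import locations are assumptions based on the public LLM API, and the checkpoint path is a placeholder.

# Sketch of the pattern the new FP8 tests exercise; import locations
# are assumed from the public LLM API, and the path is a placeholder.
from tensorrt_llm.llmapi import LLM, KvCacheConfig, QuantAlgo, QuantConfig

quant_config = QuantConfig(QuantAlgo.FP8)  # on-the-fly FP8 PTQ
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

with LLM("/path/to/Codestral-22B-v0.1",   # placeholder checkpoint path
         quant_config=quant_config,
         kv_cache_config=kv_cache_config) as llm:
    for out in llm.generate(["def quicksort(arr):"]):
        print(out.outputs[0].text)

The skip_pre_ada guard reflects that FP8 requires hardware support introduced with the Ada/Hopper GPU generations, and the skip_less_device_memory marks (70000 MB for the 7B model, 80000 MB for the 22B model) keep the cases off GPUs without enough memory.
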
@@ -2438,6 +2438,22 @@ class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
             task.evaluate(llm)
 
 
+class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Codestral-22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestKanana_Instruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "kanana-1.5-2.1b-instruct-2505"
     MODEL_PATH = f"{llm_models_root()}/kanana-1.5-2.1b-instruct-2505"

@@ -21,3 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
+accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype
+accuracy/test_llm_api.py::TestCodestral_22B_V01::test_fp8
+accuracy/test_llm_api_pytorch.py::TestCodestral_22B_V01::test_auto_dtype

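This last hunk registers the new cases in a QA test list; each line is a pytest node ID relative to the directory the list is resolved against (an assumption here). The cases can also be invoked directly, for example:

# Hedged example of running two of the newly listed cases directly;
# assumes the working directory is the accuracy suite's root.
import pytest

pytest.main([
    "accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype",
    "accuracy/test_llm_api.py::TestCodestral_22B_V01::test_fp8",
])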