mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[https://nvbugs/5596343] [test] Update accuracy baseline for GPT-OSS-20B (#8842)
Signed-off-by: Dongfeng Yu <dongfengy@nvidia.com> Signed-off-by: dongfengy <99041270+dongfengy@users.noreply.github.com> Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
This commit is contained in:
parent
baa6ba0d69
commit
7d8a913406
@ -200,7 +200,7 @@ GPT-OSS/BF16:
|
||||
- accuracy: 90.3
|
||||
- kv_cache_quant_algo: FP8
|
||||
accuracy: 90.3
|
||||
GPT-OSS/MXFP4:
|
||||
GPT-OSS/120B-MXFP4:
|
||||
- accuracy: 90.3
|
||||
- quant_algo: W4A8_MXFP4_MXFP8
|
||||
accuracy: 90.3
|
||||
@ -217,5 +217,17 @@ GPT-OSS/MXFP4:
|
||||
- quant_algo: W4A16_MXFP4
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 90.3
|
||||
GPT-OSS/20B-MXFP4:
|
||||
- accuracy: 85.0
|
||||
- quant_algo: W4A8_MXFP4_MXFP8
|
||||
accuracy: 85.0
|
||||
- quant_algo: W4A8_MXFP4_MXFP8
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 85.0
|
||||
- quant_algo: W4A16_MXFP4
|
||||
accuracy: 85.0
|
||||
- quant_algo: W4A16_MXFP4
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 85.0
|
||||
LGAI-EXAONE/EXAONE-4.0-32B:
|
||||
- accuracy: 88.36
|
||||
|
||||
@ -3248,7 +3248,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
|
||||
with llm:
|
||||
model_name = "GPT-OSS/MXFP4"
|
||||
model_name = "GPT-OSS/20B-MXFP4"
|
||||
task = GSM8K(model_name)
|
||||
task.evaluate(llm,
|
||||
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
|
||||
@ -3296,7 +3296,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
|
||||
with llm:
|
||||
model_name = "GPT-OSS/MXFP4"
|
||||
model_name = "GPT-OSS/120B-MXFP4"
|
||||
task = GSM8K(model_name)
|
||||
task.evaluate(llm,
|
||||
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
|
||||
@ -3383,7 +3383,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
moe_config=MoeConfig(backend=moe_backend))
|
||||
|
||||
with llm:
|
||||
model_name = "GPT-OSS/MXFP4"
|
||||
model_name = "GPT-OSS/20B-MXFP4"
|
||||
task = GSM8K(model_name)
|
||||
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
|
||||
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
|
||||
@ -3410,7 +3410,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
|
||||
dtype=kv_cache_dtype)
|
||||
|
||||
model_name = "GPT-OSS/MXFP4"
|
||||
model_name = "GPT-OSS/120B-MXFP4"
|
||||
with LLM(self.MODEL_PATH,
|
||||
tensor_parallel_size=4,
|
||||
pipeline_parallel_size=1,
|
||||
|
||||
@ -340,7 +340,6 @@ triton_server/test_triton_llm.py::test_llmapi_backend[1-0-enableDecoupleMode-ten
|
||||
cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5601670)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5587574)
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8] SKIP (https://nvbugs/5608790)
|
||||
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (slow I/O)
|
||||
full:H20-3e/accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] SKIP (slow I/O)
|
||||
full:H20-3e/test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[DeepSeek-V3-671B-FP8-DeepSeek-V3-0324-8] SKIP (slow I/O)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user