mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[https://nvbugs/5453727][fix] Fix bug of how GPT-OSS setup the parameters in CI (#7151)
Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
This commit is contained in:
parent
e08c7cf17b
commit
f167b1fd99
@ -2699,22 +2699,21 @@ class TestPhi4MM(LlmapiAccuracyTestHarness):
|
||||
|
||||
class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
|
||||
extra_evaluator_kwargs = {
|
||||
"fewshot_as_multiturn": True,
|
||||
"apply_chat_template": True,
|
||||
"scores_filter": "exact_match,flexible-extract",
|
||||
"MAX_OUTPUT_LEN": 8192
|
||||
}
|
||||
|
||||
MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
|
||||
|
||||
def update_task_kwargs(self, task):
|
||||
task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True
|
||||
task.EVALUATOR_KWARGS["apply_chat_template"] = True
|
||||
task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract"
|
||||
task.MAX_OUTPUT_LEN = 8192
|
||||
return task
|
||||
|
||||
@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"],
|
||||
ids=["cutlass", "trtllm", "triton"])
|
||||
@pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
|
||||
(True, True),
|
||||
])
|
||||
def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
|
||||
def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
|
||||
if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("Triton kernels are not available")
|
||||
|
||||
@ -2732,9 +2731,10 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
|
||||
with llm:
|
||||
model_name = "GPT-OSS/MXFP4"
|
||||
mocker.patch.object(GSM8K, {"MAX_OUTPUT_LEN": 8192})
|
||||
task = GSM8K(model_name)
|
||||
task = self.update_task_kwargs(task)
|
||||
task.evaluate(llm)
|
||||
task.evaluate(llm,
|
||||
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
|
||||
|
||||
@pytest.mark.skip_less_device(4)
|
||||
@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"])
|
||||
@ -2746,7 +2746,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
],
|
||||
ids=["tp4", "ep4", "dp4"])
|
||||
def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
|
||||
attention_dp, cuda_graph, overlap_scheduler):
|
||||
attention_dp, cuda_graph, overlap_scheduler, mocker):
|
||||
if moe_backend == "TRITON":
|
||||
if not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("Triton kernels are not available")
|
||||
@ -2767,8 +2767,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
with llm:
|
||||
model_name = "GPT-OSS/MXFP4"
|
||||
task = GSM8K(model_name)
|
||||
task = self.update_task_kwargs(task)
|
||||
task.evaluate(llm)
|
||||
mocker.patch.object(GSM8K, {"MAX_OUTPUT_LEN": 8192})
|
||||
task.evaluate(llm,
|
||||
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
|
||||
|
||||
@pytest.mark.skip_less_device(4)
|
||||
@pytest.mark.parametrize(
|
||||
@ -2777,7 +2778,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
],
|
||||
ids=["dp4"])
|
||||
def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
|
||||
overlap_scheduler, monkeypatch):
|
||||
overlap_scheduler, monkeypatch, mocker):
|
||||
if not IS_TRITON_KERNELS_AVAILABLE:
|
||||
pytest.skip("Triton kernels are not available")
|
||||
monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
|
||||
@ -2797,8 +2798,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
|
||||
with llm:
|
||||
model_name = "GPT-OSS/BF16"
|
||||
task = GSM8K(model_name)
|
||||
task = self.update_task_kwargs(task)
|
||||
task.evaluate(llm)
|
||||
mocker.patch.object(GSM8K, {"MAX_OUTPUT_LEN": 8192})
|
||||
task.evaluate(llm,
|
||||
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
|
||||
|
||||
|
||||
class TestEXAONE4(LlmapiAccuracyTestHarness):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user