From f167b1fd99d6c9761b1c4b91cc80009ee0973714 Mon Sep 17 00:00:00 2001 From: bhsueh_NV <11360707+byshiue@users.noreply.github.com> Date: Wed, 27 Aug 2025 15:26:10 +0800 Subject: [PATCH] [https://nvbugs/5453727][fix] Fix bug of how GPT-OSS setup the parameters in CI (#7151) Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_pytorch.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 03973b0f69..95c4298d5d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2699,22 +2699,21 @@ class TestPhi4MM(LlmapiAccuracyTestHarness): class TestGPTOSS(LlmapiAccuracyTestHarness): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5) + extra_evaluator_kwargs = { + "fewshot_as_multiturn": True, + "apply_chat_template": True, + "scores_filter": "exact_match,flexible-extract", + "MAX_OUTPUT_LEN": 8192 + } MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b" - def update_task_kwargs(self, task): - task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True - task.EVALUATOR_KWARGS["apply_chat_template"] = True - task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract" - task.MAX_OUTPUT_LEN = 8192 - return task - @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"], ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [ (True, True), ]) - def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler): + def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker): if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") @@ -2732,9 +2731,10 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): with llm: model_name = "GPT-OSS/MXFP4" + mocker.patch.object(GSM8K, 
"MAX_OUTPUT_LEN", 8192) task = GSM8K(model_name) - task = self.update_task_kwargs(task) - task.evaluate(llm) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) @pytest.mark.skip_less_device(4) @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"]) @pytest.mark.parametrize( @@ -2746,7 +2746,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ], ids=["tp4", "ep4", "dp4"]) def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size, - attention_dp, cuda_graph, overlap_scheduler): + attention_dp, cuda_graph, overlap_scheduler, mocker): if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") @@ -2767,8 +2767,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): with llm: model_name = "GPT-OSS/MXFP4" task = GSM8K(model_name) - task = self.update_task_kwargs(task) - task.evaluate(llm) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) @pytest.mark.skip_less_device(4) @pytest.mark.parametrize( @@ -2777,7 +2778,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ], ids=["dp4"]) def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, - overlap_scheduler, monkeypatch): + overlap_scheduler, monkeypatch, mocker): if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4") @@ -2797,8 +2798,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): with llm: model_name = "GPT-OSS/BF16" task = GSM8K(model_name) - task = self.update_task_kwargs(task) - task.evaluate(llm) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) class TestEXAONE4(LlmapiAccuracyTestHarness):