From f167b1fd99d6c9761b1c4b91cc80009ee0973714 Mon Sep 17 00:00:00 2001 From: bhsueh_NV <11360707+byshiue@users.noreply.github.com> Date: Wed, 27 Aug 2025 15:26:10 +0800 Subject: [PATCH] [https://nvbugs/5453727][fix] Fix bug of how GPT-OSS setup the parameters in CI (#7151) Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_pytorch.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 03973b0f69..95c4298d5d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2699,22 +2699,21 @@ class TestPhi4MM(LlmapiAccuracyTestHarness): class TestGPTOSS(LlmapiAccuracyTestHarness): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5) + extra_evaluator_kwargs = { + "fewshot_as_multiturn": True, + "apply_chat_template": True, + "scores_filter": "exact_match,flexible-extract", + "MAX_OUTPUT_LEN": 8192 + } MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b" - def update_task_kwargs(self, task): - task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True - task.EVALUATOR_KWARGS["apply_chat_template"] = True - task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract" - task.MAX_OUTPUT_LEN = 8192 - return task - @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"], ids=["cutlass", "trtllm", "triton"]) @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [ (True, True), ]) - def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler): + def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker): if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") @@ -2732,9 +2731,10 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): with llm: model_name = "GPT-OSS/MXFP4" + mocker.patch.object(GSM8K, 
"MAX_OUTPUT_LEN", 8192) task = GSM8K(model_name) - task = self.update_task_kwargs(task) - task.evaluate(llm) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) @pytest.mark.skip_less_device(4) @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"]) @pytest.mark.parametrize( @@ -2746,7 +2746,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ], ids=["tp4", "ep4", "dp4"]) def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size, - attention_dp, cuda_graph, overlap_scheduler): + attention_dp, cuda_graph, overlap_scheduler, mocker): if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") @@ -2767,8 +2767,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): with llm: model_name = "GPT-OSS/MXFP4" task = GSM8K(model_name) - task = self.update_task_kwargs(task) - task.evaluate(llm) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) @pytest.mark.skip_less_device(4) @pytest.mark.parametrize( @@ -2777,7 +2778,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): ], ids=["dp4"]) def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, - overlap_scheduler, monkeypatch): + overlap_scheduler, monkeypatch, mocker): if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4") @@ -2797,8 +2798,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): with llm: model_name = "GPT-OSS/BF16" task = GSM8K(model_name) - task = self.update_task_kwargs(task) - task.evaluate(llm) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) class TestEXAONE4(LlmapiAccuracyTestHarness):