[None][test] Add post merge test for Seed-OSS-36B-Instruct (#8321)

Signed-off-by: Zhen Huang <145532724+zhhuang-nv@users.noreply.github.com>
zhhuang-nv 2025-10-17 17:30:33 +08:00 committed by GitHub
parent e72ade33c2
commit 7a2bab93f0
9 changed files with 109 additions and 15 deletions

View File

@@ -2503,7 +2503,8 @@ def launchTestJobs(pipeline, testFilter)
// "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
"H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
"B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
"B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
"B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
"B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
// "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
// "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
"H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],

View File

@@ -34,13 +34,15 @@ class Evaluator(ABC):
random_seed: int = 0,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None):
system_prompt: Optional[str] = None,
chat_template_kwargs: Optional[dict[str, Any]] = None):
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
self.apply_chat_template = apply_chat_template
self.fewshot_as_multiturn = fewshot_as_multiturn
self.system_prompt = system_prompt
self.chat_template_kwargs = chat_template_kwargs
@abstractmethod
def generate_samples(self) -> Iterable[tuple]:
@@ -64,7 +66,9 @@ class Evaluator(ABC):
}] + messages
return llm.tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
add_generation_prompt=True,
**(self.chat_template_kwargs
or {}))
def _get_sampline_params(self, sampling_params: Optional[SamplingParams],
sampling_args: Optional[dict]) -> SamplingParams:
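
The Evaluator base class now forwards the optional chat_template_kwargs dict into tokenizer.apply_chat_template. A minimal standalone sketch of what that forwarding amounts to, assuming a Hugging Face tokenizer whose Jinja chat template reads a thinking_budget variable (as the Seed-OSS template does); the model path and message are placeholders:

from transformers import AutoTokenizer

# Placeholder checkpoint; any tokenizer whose chat template uses the
# thinking_budget variable behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("ByteDance-Seed/Seed-OSS-36B-Instruct")
messages = [{"role": "user", "content": "What is 2 + 2?"}]
# Extra keyword arguments are exposed to the chat template as variables,
# which is what **(self.chat_template_kwargs or {}) expands into above.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True,
                                       thinking_budget=0)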

View File

@@ -16,7 +16,7 @@ import copy
import json
import os
from contextlib import contextmanager
from typing import Dict, Iterable, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import click
import numpy as np
@@ -51,11 +51,13 @@ class LmEvalWrapper(TemplateLM):
def __init__(self,
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
streaming: bool = False):
streaming: bool = False,
chat_template_kwargs: Optional[dict[str, Any]] = None):
super().__init__()
self.llm = llm
self.sampling_params = sampling_params
self.streaming = streaming
self.chat_template_kwargs = chat_template_kwargs
@property
def eot_token_id(self) -> int:
@@ -72,6 +74,7 @@ class LmEvalWrapper(TemplateLM):
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**(self.chat_template_kwargs or {}),
)
@property
@@ -146,7 +149,8 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
streaming: bool = False,
max_images: int = 999):
max_images: int = 999,
chat_template_kwargs: Optional[dict[str, Any]] = None):
"""
Initialize the multimodal wrapper.
@@ -161,6 +165,7 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
# NOTE: Required by lm_eval to identify this as a multimodal model
self.MULTIMODAL = True
self.max_images = max_images
self.chat_template_kwargs = chat_template_kwargs
self.model_type = self._get_model_type(llm)
# NOTE: TRT-LLM currently does not support interleaved text and images; instead, image placeholders are added at the beginning or at the end of the text.
@@ -237,7 +242,9 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
mm_placeholder_counts=mm_placeholder_counts,
tools=None,
chat_template_kwargs={
"continue_final_message": not add_generation_prompt
**(self.chat_template_kwargs or {}),
"continue_final_message":
not add_generation_prompt,
})
return output
@@ -301,7 +308,8 @@ class LmEvalEvaluator(Evaluator):
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None,
is_multimodal: bool = False):
is_multimodal: bool = False,
chat_template_kwargs: Optional[dict[str, Any]] = None):
try:
import lm_eval
except ImportError as e:
@@ -319,7 +327,8 @@ class LmEvalEvaluator(Evaluator):
super().__init__(random_seed=random_seed,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
system_prompt=system_prompt)
system_prompt=system_prompt,
chat_template_kwargs=chat_template_kwargs)
self.task_name = task_name
self.dataset_path = dataset_path
self.num_samples = num_samples
@@ -390,7 +399,10 @@ class LmEvalEvaluator(Evaluator):
import lm_eval
lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
results = lm_eval.evaluate(
lm=lm_cls(llm, sampling_params, streaming),
lm=lm_cls(llm,
sampling_params=sampling_params,
streaming=streaming,
chat_template_kwargs=self.chat_template_kwargs),
task_dict=self.task_dict,
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
@@ -428,7 +440,9 @@ class LmEvalEvaluator(Evaluator):
fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
False),
system_prompt=kwargs.pop("system_prompt", None),
is_multimodal=kwargs.pop("is_multimodal", False))
is_multimodal=kwargs.pop("is_multimodal", False),
chat_template_kwargs=kwargs.pop("chat_template_kwargs",
None))
sampling_params = SamplingParams(
max_tokens=kwargs.pop("max_output_length"),
truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -462,6 +476,13 @@ class GSM8K(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--fewshot_as_multiturn",
is_flag=True,
default=False,
@@ -513,6 +534,13 @@ class GPQADiamond(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -556,6 +584,13 @@ class GPQAMain(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -599,6 +634,13 @@ class GPQAExtended(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -638,6 +680,13 @@ class MMMU(LmEvalEvaluator):
type=int,
default=0,
help="Random seed for dataset processing.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option(
"--system_prompt",
type=str,
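
The new --chat_template_kwargs option takes a JSON string (e.g. '{"thinking_budget": 0}') that the click callback parses into a dict before it is handed to the evaluator. The same dict can also be passed programmatically. A minimal sketch, under the assumption that GSM8K is importable from tensorrt_llm.evaluate and accepts these keyword arguments; the model path and sampling limits are placeholders:

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.evaluate import GSM8K

# Placeholder model path; any chat model whose template understands
# thinking_budget is rendered the same way.
llm = LLM(model="ByteDance-Seed/Seed-OSS-36B-Instruct")

# Mirrors the CLI wiring above: the parsed JSON dict becomes
# chat_template_kwargs on the evaluator and is applied to every prompt.
evaluator = GSM8K(apply_chat_template=True,
                  chat_template_kwargs={"thinking_budget": 0})
sampling_params = SamplingParams(max_tokens=16384,
                                 truncate_prompt_tokens=4096)
accuracy = evaluator.evaluate(llm, sampling_params)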

View File

@@ -21,6 +21,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
# Not a contribution
# Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as
# NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
@@ -34,7 +35,7 @@
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
import math
from typing import Iterable, List, Optional, Union
from typing import Any, Iterable, List, Optional, Union
import click
import numpy as np
@@ -137,10 +138,12 @@ class MMLU(Evaluator):
num_fewshot: int = 5,
random_seed: int = 0,
apply_chat_template: bool = False,
system_prompt: Optional[str] = None):
system_prompt: Optional[str] = None,
chat_template_kwargs: Optional[dict[str, Any]] = None):
super().__init__(random_seed=random_seed,
apply_chat_template=apply_chat_template,
system_prompt=system_prompt)
system_prompt=system_prompt,
chat_template_kwargs=chat_template_kwargs)
if dataset_path is None:
dataset_path = self.dowload_dataset()
self.dataset_path = dataset_path
@@ -296,6 +299,13 @@ class MMLU(Evaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -314,6 +324,7 @@ class MMLU(Evaluator):
@staticmethod
def command(ctx, dataset_path: Optional[str], num_samples: int,
num_fewshot: int, random_seed: int, apply_chat_template: bool,
chat_template_kwargs: Optional[dict[str, Any]],
system_prompt: Optional[str], max_input_length: int,
max_output_length: int, check_accuracy: bool,
accuracy_threshold: float) -> None:
@@ -326,7 +337,8 @@ class MMLU(Evaluator):
num_fewshot=num_fewshot,
random_seed=random_seed,
apply_chat_template=apply_chat_template,
system_prompt=system_prompt)
system_prompt=system_prompt,
chat_template_kwargs=chat_template_kwargs)
accuracy = evaluator.evaluate(llm, sampling_params)
llm.shutdown()

View File

@@ -221,3 +221,5 @@ GPT-OSS/MXFP4:
accuracy: 90.3
LGAI-EXAONE/EXAONE-4.0-32B:
- accuracy: 88.36
ByteDance-Seed/Seed-OSS-36B-Instruct:
- accuracy: 90.8

View File

@@ -3656,3 +3656,26 @@ class TestNano_V2_VLM(LlmapiAccuracyTestHarness):
kv_cache_config=self.kv_cache_config) as llm:
task = MMMU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=self.sampling_params)
class TestSeedOss_36B(LlmapiAccuracyTestHarness):
MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"
gsm8k_sampling_params = SamplingParams(temperature=1.1,
top_p=0.95,
max_tokens=16384)
@skip_pre_hopper
@pytest.mark.skip_less_device_memory(140000)
def test_auto_dtype(self):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
chat_template_kwargs = dict(thinking_budget=-1)
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm,
sampling_params=self.gsm8k_sampling_params,
extra_evaluator_kwargs=dict(
apply_chat_template=True,
chat_template_kwargs=chat_template_kwargs))
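
In this test the chat_template_kwargs travel via extra_evaluator_kwargs into the GSM8K accuracy task, so every prompt is rendered with thinking_budget=-1 (which the Seed-OSS model card describes as an unlimited thinking budget), and the resulting score is checked against the 90.8 reference added above. The case can presumably be reproduced with pytest by selecting accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype, the same node id registered in the test lists below.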

View File

@@ -609,6 +609,7 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

View File

@@ -189,6 +189,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]

View File

@@ -146,3 +146,4 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype