[None][test] Add post merge test for Seed-OSS-36B-Instruct (#8321)

Signed-off-by: Zhen Huang <145532724+zhhuang-nv@users.noreply.github.com>
zhhuang-nv 2025-10-17 17:30:33 +08:00 committed by GitHub
parent e72ade33c2
commit 7a2bab93f0
9 changed files with 109 additions and 15 deletions

View File

@@ -2503,7 +2503,8 @@ def launchTestJobs(pipeline, testFilter)
// "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
"H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
"B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
"B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
"B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
"B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
// "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
// "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
"H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],

View File

@@ -34,13 +34,15 @@ class Evaluator(ABC):
random_seed: int = 0,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None):
system_prompt: Optional[str] = None,
chat_template_kwargs: Optional[dict[str, Any]] = None):
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
self.apply_chat_template = apply_chat_template
self.fewshot_as_multiturn = fewshot_as_multiturn
self.system_prompt = system_prompt
self.chat_template_kwargs = chat_template_kwargs
@abstractmethod
def generate_samples(self) -> Iterable[tuple]:
@@ -64,7 +66,9 @@ class Evaluator(ABC):
}] + messages
return llm.tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
add_generation_prompt=True,
**(self.chat_template_kwargs
or {}))
def _get_sampline_params(self, sampling_params: Optional[SamplingParams],
sampling_args: Optional[dict]) -> SamplingParams:
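
The Evaluator base class now forwards the optional chat_template_kwargs dict into tokenizer.apply_chat_template. A minimal standalone sketch of what that forwarding amounts to, assuming a Hugging Face tokenizer whose Jinja chat template reads a thinking_budget variable (as the Seed-OSS template does); the model path and message are placeholders:

from transformers import AutoTokenizer

# Placeholder checkpoint; any tokenizer whose chat template uses the
# thinking_budget variable behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("ByteDance-Seed/Seed-OSS-36B-Instruct")
messages = [{"role": "user", "content": "What is 2 + 2?"}]
# Extra keyword arguments are exposed to the chat template as variables,
# which is what **(self.chat_template_kwargs or {}) expands into above.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True,
                                       thinking_budget=0)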

View File

@@ -16,7 +16,7 @@ import copy
import json
import os
from contextlib import contextmanager
from typing import Dict, Iterable, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import click
import numpy as np
@@ -51,11 +51,13 @@ class LmEvalWrapper(TemplateLM):
def __init__(self,
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
streaming: bool = False):
streaming: bool = False,
chat_template_kwargs: Optional[dict[str, Any]] = None):
super().__init__()
self.llm = llm
self.sampling_params = sampling_params
self.streaming = streaming
self.chat_template_kwargs = chat_template_kwargs
@property
def eot_token_id(self) -> int:
@@ -72,6 +74,7 @@ class LmEvalWrapper(TemplateLM):
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**(self.chat_template_kwargs or {}),
)
@property
@@ -146,7 +149,8 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
streaming: bool = False,
max_images: int = 999):
max_images: int = 999,
chat_template_kwargs: Optional[dict[str, Any]] = None):
"""
Initialize the multimodal wrapper.
@@ -161,6 +165,7 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
# NOTE: Required by lm_eval to identify this as a multimodal model
self.MULTIMODAL = True
self.max_images = max_images
self.chat_template_kwargs = chat_template_kwargs
self.model_type = self._get_model_type(llm)
# NOTE: TRT-LLM currently does not support interleaved text and images; instead, image placeholders are added at the beginning or at the end of the text.
@@ -237,7 +242,9 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
mm_placeholder_counts=mm_placeholder_counts,
tools=None,
chat_template_kwargs={
"continue_final_message": not add_generation_prompt
**(self.chat_template_kwargs or {}),
"continue_final_message":
not add_generation_prompt,
})
return output
@@ -301,7 +308,8 @@ class LmEvalEvaluator(Evaluator):
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None,
is_multimodal: bool = False):
is_multimodal: bool = False,
chat_template_kwargs: Optional[dict[str, Any]] = None):
try:
import lm_eval
except ImportError as e:
@@ -319,7 +327,8 @@ class LmEvalEvaluator(Evaluator):
super().__init__(random_seed=random_seed,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
system_prompt=system_prompt)
system_prompt=system_prompt,
chat_template_kwargs=chat_template_kwargs)
self.task_name = task_name
self.dataset_path = dataset_path
self.num_samples = num_samples
@@ -390,7 +399,10 @@ class LmEvalEvaluator(Evaluator):
import lm_eval
lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
results = lm_eval.evaluate(
lm=lm_cls(llm, sampling_params, streaming),
lm=lm_cls(llm,
sampling_params=sampling_params,
streaming=streaming,
chat_template_kwargs=self.chat_template_kwargs),
task_dict=self.task_dict,
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
@@ -428,7 +440,9 @@ class LmEvalEvaluator(Evaluator):
fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
False),
system_prompt=kwargs.pop("system_prompt", None),
is_multimodal=kwargs.pop("is_multimodal", False))
is_multimodal=kwargs.pop("is_multimodal", False),
chat_template_kwargs=kwargs.pop("chat_template_kwargs",
None))
sampling_params = SamplingParams(
max_tokens=kwargs.pop("max_output_length"),
truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -462,6 +476,13 @@ class GSM8K(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--fewshot_as_multiturn",
is_flag=True,
default=False,
@@ -513,6 +534,13 @@ class GPQADiamond(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -556,6 +584,13 @@ class GPQAMain(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -599,6 +634,13 @@ class GPQAExtended(LmEvalEvaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -638,6 +680,13 @@ class MMMU(LmEvalEvaluator):
type=int,
default=0,
help="Random seed for dataset processing.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option(
"--system_prompt",
type=str,
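
The new --chat_template_kwargs option takes a JSON string (e.g. '{"thinking_budget": 0}') that the click callback parses into a dict before it is handed to the evaluator. The same dict can also be passed programmatically. A minimal sketch, under the assumption that GSM8K is importable from tensorrt_llm.evaluate and accepts these keyword arguments; the model path and sampling limits are placeholders:

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.evaluate import GSM8K

# Placeholder model path; any chat model whose template understands
# thinking_budget is rendered the same way.
llm = LLM(model="ByteDance-Seed/Seed-OSS-36B-Instruct")

# Mirrors the CLI wiring above: the parsed JSON dict becomes
# chat_template_kwargs on the evaluator and is applied to every prompt.
evaluator = GSM8K(apply_chat_template=True,
                  chat_template_kwargs={"thinking_budget": 0})
sampling_params = SamplingParams(max_tokens=16384,
                                 truncate_prompt_tokens=4096)
accuracy = evaluator.evaluate(llm, sampling_params)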

View File

@@ -21,6 +21,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
# Not a contribution
# Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as
# NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
@@ -34,7 +35,7 @@
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
import math
from typing import Iterable, List, Optional, Union
from typing import Any, Iterable, List, Optional, Union
import click
import numpy as np
@@ -137,10 +138,12 @@ class MMLU(Evaluator):
num_fewshot: int = 5,
random_seed: int = 0,
apply_chat_template: bool = False,
system_prompt: Optional[str] = None):
system_prompt: Optional[str] = None,
chat_template_kwargs: Optional[dict[str, Any]] = None):
super().__init__(random_seed=random_seed,
apply_chat_template=apply_chat_template,
system_prompt=system_prompt)
system_prompt=system_prompt,
chat_template_kwargs=chat_template_kwargs)
if dataset_path is None:
dataset_path = self.dowload_dataset()
self.dataset_path = dataset_path
@@ -296,6 +299,13 @@ class MMLU(Evaluator):
is_flag=True,
default=False,
help="Whether to apply chat template.")
@click.option(
"--chat_template_kwargs",
type=str,
default=None,
callback=lambda ctx, param, value: json.loads(value) if value else None,
help=
'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
@click.option("--system_prompt",
type=str,
default=None,
@@ -314,6 +324,7 @@ class MMLU(Evaluator):
@staticmethod
def command(ctx, dataset_path: Optional[str], num_samples: int,
num_fewshot: int, random_seed: int, apply_chat_template: bool,
chat_template_kwargs: Optional[dict[str, Any]],
system_prompt: Optional[str], max_input_length: int,
max_output_length: int, check_accuracy: bool,
accuracy_threshold: float) -> None:
@@ -326,7 +337,8 @@ class MMLU(Evaluator):
num_fewshot=num_fewshot,
random_seed=random_seed,
apply_chat_template=apply_chat_template,
system_prompt=system_prompt)
system_prompt=system_prompt,
chat_template_kwargs=chat_template_kwargs)
accuracy = evaluator.evaluate(llm, sampling_params)
llm.shutdown()

View File

@@ -221,3 +221,5 @@ GPT-OSS/MXFP4:
accuracy: 90.3
LGAI-EXAONE/EXAONE-4.0-32B:
- accuracy: 88.36
ByteDance-Seed/Seed-OSS-36B-Instruct:
- accuracy: 90.8

View File

@@ -3656,3 +3656,26 @@ class TestNano_V2_VLM(LlmapiAccuracyTestHarness):
kv_cache_config=self.kv_cache_config) as llm:
task = MMMU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=self.sampling_params)
class TestSeedOss_36B(LlmapiAccuracyTestHarness):
MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"
gsm8k_sampling_params = SamplingParams(temperature=1.1,
top_p=0.95,
max_tokens=16384)
@skip_pre_hopper
@pytest.mark.skip_less_device_memory(140000)
def test_auto_dtype(self):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
chat_template_kwargs = dict(thinking_budget=-1)
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm,
sampling_params=self.gsm8k_sampling_params,
extra_evaluator_kwargs=dict(
apply_chat_template=True,
chat_template_kwargs=chat_template_kwargs))
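
In this test the chat_template_kwargs travel via extra_evaluator_kwargs into the GSM8K accuracy task, so every prompt is rendered with thinking_budget=-1 (which the Seed-OSS model card describes as an unlimited thinking budget), and the resulting score is checked against the 90.8 reference added above. The case can presumably be reproduced with pytest by selecting accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype, the same node id registered in the test lists below.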

View File

@@ -609,6 +609,7 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

View File

@@ -189,6 +189,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]

View File

@@ -146,3 +146,4 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype