[None][test] Add post merge test for Seed-OSS-36B-Instruct (#8321)
Signed-off-by: Zhen Huang <145532724+zhhuang-nv@users.noreply.github.com>
This commit is contained in:
parent e72ade33c2
commit 7a2bab93f0
@@ -2503,7 +2503,8 @@ def launchTestJobs(pipeline, testFilter)
         // "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
         "H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
-        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
+        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
+        "B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         // "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
         // "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
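Note on the stage map above: each value follows a [platform, test list, shard index, shard count] shape, so this change splits the B200 PyTorch post-merge job from one shard into two. A minimal sketch of the pattern (a hypothetical consumer for illustration; the real scheduling lives in launchTestJobs):

    # Illustration of the [platform, test_list, split_id, split_count] pattern
    # above (hypothetical consumer, not the actual Jenkins logic).
    stages = {
        "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],
        "B200_PCIe-PyTorch-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
    }
    for name, (platform, test_list, split_id, split_count) in stages.items():
        # Each stage runs one shard of its test list on the given platform.
        print(f"{name}: {test_list} shard {split_id}/{split_count} on {platform}")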
@@ -34,13 +34,15 @@ class Evaluator(ABC):
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         random.seed(random_seed)
         np.random.seed(random_seed)
         torch.manual_seed(random_seed)
         self.apply_chat_template = apply_chat_template
         self.fewshot_as_multiturn = fewshot_as_multiturn
         self.system_prompt = system_prompt
+        self.chat_template_kwargs = chat_template_kwargs

     @abstractmethod
     def generate_samples(self) -> Iterable[tuple]:
@@ -64,7 +66,9 @@ class Evaluator(ABC):
             }] + messages
         return llm.tokenizer.apply_chat_template(messages,
                                                  tokenize=False,
-                                                 add_generation_prompt=True)
+                                                 add_generation_prompt=True,
+                                                 **(self.chat_template_kwargs
+                                                    or {}))

     def _get_sampline_params(self, sampling_params: Optional[SamplingParams],
                              sampling_args: Optional[dict]) -> SamplingParams:
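Note on the splat added above: user-supplied chat-template kwargs are forwarded verbatim into the tokenizer's chat template, where they become template variables. A minimal standalone sketch, assuming a Hugging Face-style tokenizer whose template understands a thinking_budget variable (the example kwarg from this PR's own CLI help):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("ByteDance-Seed/Seed-OSS-36B-Instruct")
    chat_template_kwargs = {"thinking_budget": 0}  # e.g., disable the thinking phase
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": "What is 2 + 2?"}],
        tokenize=False,
        add_generation_prompt=True,
        **(chat_template_kwargs or {}),  # extra kwargs become template variables
    )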
@@ -16,7 +16,7 @@ import copy
 import json
 import os
 from contextlib import contextmanager
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import click
 import numpy as np
@@ -51,11 +51,13 @@ class LmEvalWrapper(TemplateLM):
     def __init__(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
-                 streaming: bool = False):
+                 streaming: bool = False,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         super().__init__()
         self.llm = llm
         self.sampling_params = sampling_params
         self.streaming = streaming
+        self.chat_template_kwargs = chat_template_kwargs

     @property
     def eot_token_id(self) -> int:
@@ -72,6 +74,7 @@ class LmEvalWrapper(TemplateLM):
             tokenize=False,
             add_generation_prompt=add_generation_prompt,
             continue_final_message=not add_generation_prompt,
+            **(self.chat_template_kwargs or {}),
         )

     @property
@@ -146,7 +149,8 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
                  streaming: bool = False,
-                 max_images: int = 999):
+                 max_images: int = 999,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         """
         Initialize the multimodal wrapper.

@@ -161,6 +165,7 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
         # NOTE: Required by lm_eval to identify this as a multimodal model
         self.MULTIMODAL = True
         self.max_images = max_images
+        self.chat_template_kwargs = chat_template_kwargs
         self.model_type = self._get_model_type(llm)

         # NOTE: In TRT-LLM, currently we do not support interleaved text and image. Instead, we are adding image placeholders at the end of the text or at the beginning of the text.
@@ -237,7 +242,9 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
             mm_placeholder_counts=mm_placeholder_counts,
             tools=None,
             chat_template_kwargs={
-                "continue_final_message": not add_generation_prompt
+                **(self.chat_template_kwargs or {}),
+                "continue_final_message":
+                not add_generation_prompt,
             })
         return output

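Note on the merged dict above: the **(self.chat_template_kwargs or {}) splat comes first, so the wrapper's explicit "continue_final_message" entry wins if a caller passes the same key. The ordering rule, isolated:

    # In a dict literal, later keys override earlier ones, so the explicit entry
    # cannot be clobbered by user-supplied chat_template_kwargs.
    user_kwargs = {"thinking_budget": 0, "continue_final_message": True}
    merged = {
        **(user_kwargs or {}),
        "continue_final_message": False,  # explicit key wins over the splat
    }
    assert merged == {"thinking_budget": 0, "continue_final_message": False}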
@@ -301,7 +308,8 @@ class LmEvalEvaluator(Evaluator):
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
-                 is_multimodal: bool = False):
+                 is_multimodal: bool = False,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         try:
             import lm_eval
         except ImportError as e:
@@ -319,7 +327,8 @@ class LmEvalEvaluator(Evaluator):
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
                          fewshot_as_multiturn=fewshot_as_multiturn,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         self.task_name = task_name
         self.dataset_path = dataset_path
         self.num_samples = num_samples
@@ -390,7 +399,10 @@ class LmEvalEvaluator(Evaluator):
         import lm_eval
         lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
         results = lm_eval.evaluate(
-            lm=lm_cls(llm, sampling_params, streaming),
+            lm=lm_cls(llm,
+                      sampling_params=sampling_params,
+                      streaming=streaming,
+                      chat_template_kwargs=self.chat_template_kwargs),
             task_dict=self.task_dict,
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
@@ -428,7 +440,9 @@ class LmEvalEvaluator(Evaluator):
             fewshot_as_multiturn=kwargs.pop("fewshot_as_multiturn",
                                             False),
             system_prompt=kwargs.pop("system_prompt", None),
-            is_multimodal=kwargs.pop("is_multimodal", False))
+            is_multimodal=kwargs.pop("is_multimodal", False),
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs",
+                                            None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
             truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -462,6 +476,13 @@ class GSM8K(LmEvalEvaluator):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
     @click.option("--fewshot_as_multiturn",
                   is_flag=True,
                   default=False,
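Note on the new option: the click callback parses --chat_template_kwargs eagerly, so malformed JSON fails at argument-parsing time rather than mid-evaluation. The same parse, isolated:

    import json

    # Mirrors the callback: None/empty stays None; anything else must be valid JSON.
    parse = lambda value: json.loads(value) if value else None

    assert parse(None) is None
    assert parse('{"thinking_budget": 0}') == {"thinking_budget": 0}
    # parse('not json') raises json.JSONDecodeError before any request is sent.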
@@ -513,6 +534,13 @@ class GPQADiamond(LmEvalEvaluator):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
     @click.option("--system_prompt",
                   type=str,
                   default=None,
@@ -556,6 +584,13 @@ class GPQAMain(LmEvalEvaluator):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
     @click.option("--system_prompt",
                   type=str,
                   default=None,
@@ -599,6 +634,13 @@ class GPQAExtended(LmEvalEvaluator):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
     @click.option("--system_prompt",
                   type=str,
                   default=None,
@@ -638,6 +680,13 @@ class MMMU(LmEvalEvaluator):
                   type=int,
                   default=0,
                   help="Random seed for dataset processing.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
     @click.option(
         "--system_prompt",
         type=str,
@@ -21,6 +21,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+import json
 # Not a contribution
 # Changes made by NVIDIA CORPORATION & AFFILIATES or otherwise documented as
 # NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
@@ -34,7 +35,7 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 import math
-from typing import Iterable, List, Optional, Union
+from typing import Any, Iterable, List, Optional, Union

 import click
 import numpy as np
@@ -137,10 +138,12 @@ class MMLU(Evaluator):
                  num_fewshot: int = 5,
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 chat_template_kwargs: Optional[dict[str, Any]] = None):
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         if dataset_path is None:
             dataset_path = self.dowload_dataset()
         self.dataset_path = dataset_path
@@ -296,6 +299,13 @@ class MMLU(Evaluator):
                   is_flag=True,
                   default=False,
                   help="Whether to apply chat template.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
     @click.option("--system_prompt",
                   type=str,
                   default=None,
@@ -314,6 +324,7 @@ class MMLU(Evaluator):
     @staticmethod
     def command(ctx, dataset_path: Optional[str], num_samples: int,
                 num_fewshot: int, random_seed: int, apply_chat_template: bool,
+                chat_template_kwargs: Optional[dict[str, Any]],
                 system_prompt: Optional[str], max_input_length: int,
                 max_output_length: int, check_accuracy: bool,
                 accuracy_threshold: float) -> None:
@@ -326,7 +337,8 @@ class MMLU(Evaluator):
                          num_fewshot=num_fewshot,
                          random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         chat_template_kwargs=chat_template_kwargs)
         accuracy = evaluator.evaluate(llm, sampling_params)
         llm.shutdown()

@@ -221,3 +221,5 @@ GPT-OSS/MXFP4:
   accuracy: 90.3
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36
+ByteDance-Seed/Seed-OSS-36B-Instruct:
+  - accuracy: 90.8
@@ -3656,3 +3656,26 @@ class TestNano_V2_VLM(LlmapiAccuracyTestHarness):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestSeedOss_36B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"
+
+    gsm8k_sampling_params = SamplingParams(temperature=1.1,
+                                           top_p=0.95,
+                                           max_tokens=16384)
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device_memory(140000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        chat_template_kwargs = dict(thinking_budget=-1)
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=self.gsm8k_sampling_params,
+                          extra_evaluator_kwargs=dict(
+                              apply_chat_template=True,
+                              chat_template_kwargs=chat_template_kwargs))
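Note on the new test: extra_evaluator_kwargs travel through the command/constructor plumbing changed earlier in this diff, and chat_template_kwargs are then splatted into the tokenizer call. A sketch of the net effect (harness internals assumed, not shown; per the Seed-OSS model card, thinking_budget=-1 presumably leaves the thinking budget unrestricted):

    # Hypothetical trace of the kwargs exercised by test_auto_dtype above.
    extra_evaluator_kwargs = dict(
        apply_chat_template=True,                       # render prompts via the chat template
        chat_template_kwargs=dict(thinking_budget=-1),  # assumption: -1 = unlimited thinking
    )
    # Inside the evaluator, prompt rendering then effectively becomes:
    #   llm.tokenizer.apply_chat_template(messages,
    #                                     tokenize=False,
    #                                     add_generation_prompt=True,
    #                                     thinking_budget=-1)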
@@ -609,6 +609,7 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype

 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
@@ -189,6 +189,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
+accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
@@ -146,3 +146,4 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype