From 287f6c2e0f1ae7f28b85904059b53180ce25e91f Mon Sep 17 00:00:00 2001
From: Zhenhuan Chen
Date: Wed, 14 Jan 2026 16:01:38 +0800
Subject: [PATCH] [None][test] add log_samples and output_path for trtllm_eval
 (#10629)

Signed-off-by: Zhenhuan Chen
---
 requirements-dev.txt             |  2 +-
 tensorrt_llm/evaluate/lm_eval.py | 88 +++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7b3b8cbcab..4ff771f955 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -28,7 +28,7 @@ jieba==0.42.1
 rouge==1.0.1
 pytest-rerunfailures
 ruff==0.9.4
-lm_eval[api]==0.4.9.2
+lm_eval[api]==0.4.9.1 # 0.4.9.2 bug for custom config: https://github.com/EleutherAI/lm-evaluation-harness/pull/3436
 docstring_parser
 genai-perf==0.0.13
 opentelemetry-sdk>=1.26.0
diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
index 4a877d75f4..ab51b8feec 100644
--- a/tensorrt_llm/evaluate/lm_eval.py
+++ b/tensorrt_llm/evaluate/lm_eval.py
@@ -16,6 +16,7 @@ import copy
 import json
 import os
 from contextlib import contextmanager
+from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import click
@@ -334,7 +335,9 @@ class LmEvalEvaluator(Evaluator):
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
                  is_multimodal: bool = False,
-                 chat_template_kwargs: Optional[dict[str, Any]] = None):
+                 chat_template_kwargs: Optional[dict[str, Any]] = None,
+                 log_samples: bool = False,
+                 output_path: Optional[str] = None):
         try:
             import lm_eval
         except ImportError as e:
@@ -357,6 +360,8 @@ class LmEvalEvaluator(Evaluator):
         self.task_name = task_name
         self.dataset_path = dataset_path
         self.num_samples = num_samples
+        self.log_samples = log_samples
+        self.output_path = output_path
 
         task_manager = TaskManager(
             include_path=f"{os.path.dirname(__file__)}/lm_eval_tasks")
@@ -436,6 +441,14 @@ class LmEvalEvaluator(Evaluator):
                       *auxiliaries) -> float:
         raise NotImplementedError()
 
+    def save_results(self, results: dict) -> None:
+        path = Path(self.output_path)
+        path.mkdir(parents=True, exist_ok=True)
+        result_path = (path / f"samples_{self.task_name}.json")
+        with open(result_path, "w") as f:
+            json.dump(results, f, indent=2)
+        logger.info(f"Results saved to {result_path}")
+
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
@@ -456,7 +469,9 @@
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
             fewshot_as_multiturn=self.fewshot_as_multiturn,
-            system_instruction=self.system_prompt)
+            system_instruction=self.system_prompt,
+            log_samples=self.log_samples)
+
         # Normalize scores to range 0~100
         scores = results["results"][self.task_name]
         for metric in scores.keys():
@@ -465,6 +480,11 @@
         logger.info(
             f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
         )
+
+        # Save results if output_path is specified
+        if self.output_path:
+            self.save_results(results)
+
         if scores_filter is not None:
             result_acc = results["results"][self.task_name][scores_filter]
             logger.info(
@@ -491,7 +511,9 @@ class LmEvalEvaluator(Evaluator):
             system_prompt=kwargs.pop("system_prompt", None),
             is_multimodal=kwargs.pop("is_multimodal", False),
             chat_template_kwargs=kwargs.pop("chat_template_kwargs",
-                                            None))
+                                            None),
+            log_samples=kwargs.pop("log_samples", False),
+            output_path=kwargs.pop("output_path", None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
             truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -548,6 +570,14 @@ class GSM8K(LmEvalEvaluator):
                   type=int,
                   default=256,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -602,6 +632,14 @@ class GPQADiamond(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
    def command(ctx, **kwargs) -> None:
@@ -652,6 +690,14 @@ class GPQAMain(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -702,6 +748,14 @@ class GPQAExtended(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -753,6 +807,14 @@ class MMMU(LmEvalEvaluator):
                   default=
                   512,  # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -823,12 +885,16 @@ class LongBenchV1(LmEvalEvaluator):
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
             fewshot_as_multiturn=self.fewshot_as_multiturn,
-            system_instruction=self.system_prompt)
-
+            system_instruction=self.system_prompt,
+            log_samples=self.log_samples)
         logger.info(
             f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
         )
 
+        # Save results if output_path is specified
+        if self.output_path:
+            self.save_results(results)
+
         # LongBench is a group task in lm-eval. lm-eval already computes subgroup
         # "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
         # To keep this implementation simple and aligned with the printed table,
@@ -897,6 +963,14 @@ class LongBenchV1(LmEvalEvaluator):
                   type=str,
                   default=None,
                   help="System prompt.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -908,7 +982,9 @@ class LongBenchV1(LmEvalEvaluator):
             random_seed=kwargs.pop("random_seed", 0),
             apply_chat_template=kwargs.pop("apply_chat_template", True),
             system_prompt=kwargs.pop("system_prompt", None),
-            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None))
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None),
+            log_samples=kwargs.pop("log_samples", False),
+            output_path=kwargs.pop("output_path", None))
 
         # Let lm-eval task configs control sampling via gen_kwargs.
         sampling_params = None
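The snippet below is a small, hypothetical sketch (not part of the patch above) of reading back the file written by the new save_results() hook. The "results" directory and the "gsm8k" task name are illustrative assumptions for a run configured with output_path="results" and log_samples=True.

import json
from pathlib import Path

# Assumption: an evaluator was constructed with output_path="results" and ran the
# "gsm8k" task, so save_results() dumped the full lm-eval results dict (including
# per-sample records when log_samples=True) to results/samples_gsm8k.json.
result_file = Path("results") / "samples_gsm8k.json"
with open(result_file) as f:
    results = json.load(f)

# Aggregate metrics per task, as returned by lm_eval.simple_evaluate().
print(results["results"]["gsm8k"])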