commit 260eacee4b
Zhenhuan Chen, 2026-01-13 21:15:45 +08:00 (committed by GitHub)
2 changed files with 83 additions and 7 deletions

File 1 of 2

@@ -28,7 +28,7 @@ jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures
ruff==0.9.4
-lm_eval[api]==0.4.9.2
+lm_eval[api]==0.4.9.1 # 0.4.9.2 bug for custom config: https://github.com/EleutherAI/lm-evaluation-harness/pull/3436
docstring_parser
genai-perf==0.0.13
opentelemetry-sdk>=1.26.0
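The pin above works around a custom-task-config regression in lm_eval 0.4.9.2 (see the linked upstream PR). Below is a minimal, illustrative guard one could put in an eval entrypoint to fail fast on the bad release; it is not part of this commit and only assumes the lm_eval distribution is installed.

from importlib.metadata import version

# Illustrative only: refuse to run against the known-bad lm_eval release.
if version("lm_eval") == "0.4.9.2":
    raise RuntimeError(
        "lm_eval 0.4.9.2 breaks custom task configs "
        "(https://github.com/EleutherAI/lm-evaluation-harness/pull/3436); "
        "use the pinned 0.4.9.1 instead."
    )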

File 2 of 2

@@ -16,6 +16,7 @@ import copy
import json
import os
from contextlib import contextmanager
+from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import click
@@ -334,7 +335,9 @@ class LmEvalEvaluator(Evaluator):
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None,
is_multimodal: bool = False,
-chat_template_kwargs: Optional[dict[str, Any]] = None):
+chat_template_kwargs: Optional[dict[str, Any]] = None,
+log_samples: bool = False,
+output_path: Optional[str] = None):
try:
import lm_eval
except ImportError as e:
@@ -357,6 +360,8 @@ class LmEvalEvaluator(Evaluator):
self.task_name = task_name
self.dataset_path = dataset_path
self.num_samples = num_samples
+self.log_samples = log_samples
+self.output_path = output_path
task_manager = TaskManager(
include_path=f"{os.path.dirname(__file__)}/lm_eval_tasks")
@@ -436,6 +441,14 @@ class LmEvalEvaluator(Evaluator):
*auxiliaries) -> float:
raise NotImplementedError()
+def save_results(self, results: dict) -> None:
+    path = Path(self.output_path)
+    path.mkdir(parents=True, exist_ok=True)
+    result_path = (path / f"samples_{self.task_name}.json")
+    with open(result_path, "w") as f:
+        json.dump(results, f, indent=2)
+    logger.info(f"Results saved to {result_path}")
def evaluate(self,
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
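For reference, the file written by the new save_results() helper is plain JSON named samples_<task_name>.json under output_path. A minimal sketch of reading it back is shown here; the directory and task name are placeholders, not values from this commit.

import json
from pathlib import Path

output_dir = Path("eval_outputs")   # placeholder: whatever was passed as output_path
task_name = "gsm8k"                 # placeholder task name

result_file = output_dir / f"samples_{task_name}.json"
with open(result_file) as f:
    results = json.load(f)

# Per-task metrics sit under results["results"][<task_name>], matching the
# dict that evaluate() hands to save_results() in this diff.
print(results["results"][task_name])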
@@ -456,7 +469,9 @@
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
fewshot_as_multiturn=self.fewshot_as_multiturn,
-system_instruction=self.system_prompt)
+system_instruction=self.system_prompt,
+log_samples=self.log_samples)
# Normalize scores to range 0~100
scores = results["results"][self.task_name]
for metric in scores.keys():
@@ -465,6 +480,11 @@
logger.info(
f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
)
+# Save results if output_path is specified
+if self.output_path:
+    self.save_results(results)
if scores_filter is not None:
result_acc = results["results"][self.task_name][scores_filter]
logger.info(
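Forwarding log_samples into the lm-eval call asks the harness to keep per-sample records in the returned dict (under a "samples" key in recent lm-eval releases), so they end up in the saved JSON alongside the aggregate scores. The helper below is a hedged sketch for inspecting those records after evaluation; it is hypothetical and the record fields reflect lm-eval's sample format, not code from this commit.

def preview_logged_samples(results: dict, task_name: str, n: int = 3) -> None:
    # "samples" is where lm-eval places per-sample records when log_samples=True;
    # each record typically includes the source doc id, raw responses ("resps")
    # and the post-filter responses used for scoring ("filtered_resps").
    for record in results.get("samples", {}).get(task_name, [])[:n]:
        print(record.get("doc_id"), record.get("filtered_resps"))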
@@ -491,7 +511,9 @@
system_prompt=kwargs.pop("system_prompt", None),
is_multimodal=kwargs.pop("is_multimodal", False),
chat_template_kwargs=kwargs.pop("chat_template_kwargs",
-None))
+None),
+log_samples=kwargs.pop("log_samples", False),
+output_path=kwargs.pop("output_path", None))
sampling_params = SamplingParams(
max_tokens=kwargs.pop("max_output_length"),
truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -548,6 +570,14 @@ class GSM8K(LmEvalEvaluator):
type=int,
default=256,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
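The same --log_samples/--output_path option pair is repeated on each task command below (GPQADiamond, GPQAMain, GPQAExtended, MMMU, LongBenchV1). As a standalone sketch of how that pair behaves in a bare click command, assuming only the click library; the command name is hypothetical and unrelated to the project's CLI.

import click

@click.command()
@click.option("--log_samples",
              is_flag=True,
              default=False,
              help="Log sample outputs for debugging.")
@click.option("--output_path",
              type=str,
              default=None,
              help="Path to save evaluation results.")
def demo_eval(log_samples, output_path):
    # is_flag=True makes the option boolean: present -> True, absent -> False.
    # output_path stays None unless given, which is what the evaluators use to
    # decide whether to call save_results().
    click.echo(f"log_samples={log_samples}, output_path={output_path}")

if __name__ == "__main__":
    demo_eval()   # e.g. `python demo_eval.py --log_samples --output_path results/`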
@@ -602,6 +632,14 @@ class GPQADiamond(LmEvalEvaluator):
type=int,
default=32768,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -652,6 +690,14 @@ class GPQAMain(LmEvalEvaluator):
type=int,
default=32768,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -702,6 +748,14 @@ class GPQAExtended(LmEvalEvaluator):
type=int,
default=32768,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -753,6 +807,14 @@ class MMMU(LmEvalEvaluator):
default=
512, # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -823,12 +885,16 @@ class LongBenchV1(LmEvalEvaluator):
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
fewshot_as_multiturn=self.fewshot_as_multiturn,
-system_instruction=self.system_prompt)
+system_instruction=self.system_prompt,
+log_samples=self.log_samples)
logger.info(
f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
)
+# Save results if output_path is specified
+if self.output_path:
+    self.save_results(results)
# LongBench is a group task in lm-eval. lm-eval already computes subgroup
# "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
# To keep this implementation simple and aligned with the printed table,
@@ -897,6 +963,14 @@ class LongBenchV1(LmEvalEvaluator):
type=str,
default=None,
help="System prompt.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -908,7 +982,9 @@ class LongBenchV1(LmEvalEvaluator):
random_seed=kwargs.pop("random_seed", 0),
apply_chat_template=kwargs.pop("apply_chat_template", True),
system_prompt=kwargs.pop("system_prompt", None),
-chat_template_kwargs=kwargs.pop("chat_template_kwargs", None))
+chat_template_kwargs=kwargs.pop("chat_template_kwargs", None),
+log_samples=kwargs.pop("log_samples", False),
+output_path=kwargs.pop("output_path", None))
# Let lm-eval task configs control sampling via gen_kwargs.
sampling_params = None