From 287f6c2e0f1ae7f28b85904059b53180ce25e91f Mon Sep 17 00:00:00 2001
From: Zhenhuan Chen
Date: Wed, 14 Jan 2026 16:01:38 +0800
Subject: [PATCH] [None][test] add log_samples and output_path for trtllm_eval
 (#10629)

Signed-off-by: Zhenhuan Chen
---
 requirements-dev.txt             |  2 +-
 tensorrt_llm/evaluate/lm_eval.py | 88 +++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7b3b8cbcab..4ff771f955 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -28,7 +28,7 @@ jieba==0.42.1
 rouge==1.0.1
 pytest-rerunfailures
 ruff==0.9.4
-lm_eval[api]==0.4.9.2
+lm_eval[api]==0.4.9.1 # 0.4.9.2 bug for custom config: https://github.com/EleutherAI/lm-evaluation-harness/pull/3436
 docstring_parser
 genai-perf==0.0.13
 opentelemetry-sdk>=1.26.0
diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
index 4a877d75f4..ab51b8feec 100644
--- a/tensorrt_llm/evaluate/lm_eval.py
+++ b/tensorrt_llm/evaluate/lm_eval.py
@@ -16,6 +16,7 @@ import copy
 import json
 import os
 from contextlib import contextmanager
+from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import click
@@ -334,7 +335,9 @@ class LmEvalEvaluator(Evaluator):
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
                  is_multimodal: bool = False,
-                 chat_template_kwargs: Optional[dict[str, Any]] = None):
+                 chat_template_kwargs: Optional[dict[str, Any]] = None,
+                 log_samples: bool = False,
+                 output_path: Optional[str] = None):
         try:
             import lm_eval
         except ImportError as e:
@@ -357,6 +360,8 @@ class LmEvalEvaluator(Evaluator):
         self.task_name = task_name
         self.dataset_path = dataset_path
         self.num_samples = num_samples
+        self.log_samples = log_samples
+        self.output_path = output_path
 
         task_manager = TaskManager(
             include_path=f"{os.path.dirname(__file__)}/lm_eval_tasks")
@@ -436,6 +441,14 @@ class LmEvalEvaluator(Evaluator):
                       *auxiliaries) -> float:
         raise NotImplementedError()
 
+    def save_results(self, results: dict) -> None:
+        path = Path(self.output_path)
+        path.mkdir(parents=True, exist_ok=True)
+        result_path = (path / f"samples_{self.task_name}.json")
+        with open(result_path, "w") as f:
+            json.dump(results, f, indent=2)
+        logger.info(f"Results saved to {result_path}")
+
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
@@ -456,7 +469,9 @@
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
             fewshot_as_multiturn=self.fewshot_as_multiturn,
-            system_instruction=self.system_prompt)
+            system_instruction=self.system_prompt,
+            log_samples=self.log_samples)
+
         # Normalize scores to range 0~100
         scores = results["results"][self.task_name]
         for metric in scores.keys():
@@ -465,6 +480,11 @@
         logger.info(
             f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
         )
+
+        # Save results if output_path is specified
+        if self.output_path:
+            self.save_results(results)
+
         if scores_filter is not None:
             result_acc = results["results"][self.task_name][scores_filter]
             logger.info(
@@ -491,7 +511,9 @@ class LmEvalEvaluator(Evaluator):
             system_prompt=kwargs.pop("system_prompt", None),
             is_multimodal=kwargs.pop("is_multimodal", False),
             chat_template_kwargs=kwargs.pop("chat_template_kwargs",
-                                            None))
+                                            None),
+            log_samples=kwargs.pop("log_samples", False),
+            output_path=kwargs.pop("output_path", None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
             truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -548,6 +570,14 @@ class GSM8K(LmEvalEvaluator):
                   type=int,
                   default=256,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -602,6 +632,14 @@ class GPQADiamond(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
    def command(ctx, **kwargs) -> None:
@@ -652,6 +690,14 @@ class GPQAMain(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -702,6 +748,14 @@ class GPQAExtended(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -753,6 +807,14 @@ class MMMU(LmEvalEvaluator):
                   default=
                   512,  # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -823,12 +885,16 @@ class LongBenchV1(LmEvalEvaluator):
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
             fewshot_as_multiturn=self.fewshot_as_multiturn,
-            system_instruction=self.system_prompt)
-
+            system_instruction=self.system_prompt,
+            log_samples=self.log_samples)
         logger.info(
             f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
         )
 
+        # Save results if output_path is specified
+        if self.output_path:
+            self.save_results(results)
+
         # LongBench is a group task in lm-eval. lm-eval already computes subgroup
         # "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
         # To keep this implementation simple and aligned with the printed table,
@@ -897,6 +963,14 @@ class LongBenchV1(LmEvalEvaluator):
                   type=str,
                   default=None,
                   help="System prompt.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -908,7 +982,9 @@ class LongBenchV1(LmEvalEvaluator):
             random_seed=kwargs.pop("random_seed", 0),
             apply_chat_template=kwargs.pop("apply_chat_template", True),
             system_prompt=kwargs.pop("system_prompt", None),
-            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None))
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None),
+            log_samples=kwargs.pop("log_samples", False),
+            output_path=kwargs.pop("output_path", None))
 
         # Let lm-eval task configs control sampling via gen_kwargs.
         sampling_params = None
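The snippet below is a small, hypothetical sketch (not part of the patch above) of reading back the file written by the new save_results() hook. The "results" directory and the "gsm8k" task name are illustrative assumptions for a run configured with output_path="results" and log_samples=True.

import json
from pathlib import Path

# Assumption: an evaluator was constructed with output_path="results" and ran the
# "gsm8k" task, so save_results() dumped the full lm-eval results dict (including
# per-sample records when log_samples=True) to results/samples_gsm8k.json.
result_file = Path("results") / "samples_gsm8k.json"
with open(result_file) as f:
    results = json.load(f)

# Aggregate metrics per task, as returned by lm_eval.simple_evaluate().
print(results["results"]["gsm8k"])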