From dfac07c0456d475a76dc8e9fa1f8f4035daec50c Mon Sep 17 00:00:00 2001
From: heyuhhh <58161490+heyuhhh@users.noreply.github.com>
Date: Thu, 15 Jan 2026 23:27:08 +0800
Subject: [PATCH] [None][feat] Support to export data in trtllm-eval (#10075)

Signed-off-by: yuhangh <58161490+heyuhhh@users.noreply.github.com>
---
 tensorrt_llm/evaluate/cnn_dailymail.py  | 16 ++++--
 tensorrt_llm/evaluate/interface.py      | 68 ++++++++++++++++++++++++-
 tensorrt_llm/evaluate/json_mode_eval.py | 16 ++++--
 tensorrt_llm/evaluate/lm_eval.py        | 61 ++++++++++++++++++----
 tensorrt_llm/evaluate/longbench_v2.py   | 31 ++++++++---
 tensorrt_llm/evaluate/mmlu.py           | 15 ++++--
 6 files changed, 177 insertions(+), 30 deletions(-)

diff --git a/tensorrt_llm/evaluate/cnn_dailymail.py b/tensorrt_llm/evaluate/cnn_dailymail.py
index a5bb14eada..f46cf65cc4 100644
--- a/tensorrt_llm/evaluate/cnn_dailymail.py
+++ b/tensorrt_llm/evaluate/cnn_dailymail.py
@@ -34,10 +34,12 @@ class CnnDailymail(Evaluator):
                  random_seed: int = 0,
                  rouge_path: Optional[str] = None,
                  apply_chat_template: bool = False,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 output_dir: Optional[str] = None):
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         output_dir=output_dir)
         if dataset_path is None:
             dataset_path = "ccdv/cnn_dailymail"
         self.data = datasets.load_dataset(dataset_path,
@@ -111,12 +113,17 @@ class CnnDailymail(Evaluator):
                   type=int,
                   default=100,
                   help="Maximum generation length.")
+    @click.option("--output_dir",
+                  type=str,
+                  default=None,
+                  help="Directory to save the dumped inference results.")
     @click.pass_context
     @staticmethod
     def command(ctx, dataset_path: Optional[str], num_samples: int,
                 random_seed: int, rouge_path: Optional[str],
                 apply_chat_template: bool, system_prompt: Optional[str],
-                max_input_length: int, max_output_length: int) -> None:
+                max_input_length: int, max_output_length: int,
+                output_dir: Optional[str]) -> None:
         llm: Union[LLM, PyTorchLLM] = ctx.obj
         sampling_params = SamplingParams(
             max_tokens=max_output_length,
@@ -126,6 +133,7 @@
                                  random_seed=random_seed,
                                  rouge_path=rouge_path,
                                  apply_chat_template=apply_chat_template,
-                                 system_prompt=system_prompt)
+                                 system_prompt=system_prompt,
+                                 output_dir=output_dir)
         evaluator.evaluate(llm, sampling_params)
         llm.shutdown()
diff --git a/tensorrt_llm/evaluate/interface.py b/tensorrt_llm/evaluate/interface.py
index 4652fa8bcb..7aa1a6b14b 100644
--- a/tensorrt_llm/evaluate/interface.py
+++ b/tensorrt_llm/evaluate/interface.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
+import json
+import os
 import random
 from abc import ABC, abstractmethod
 from typing import Any, Iterable, List, Optional, Union
@@ -35,7 +37,8 @@ class Evaluator(ABC):
                  apply_chat_template: bool = False,
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
-                 chat_template_kwargs: Optional[dict[str, Any]] = None):
+                 chat_template_kwargs: Optional[dict[str, Any]] = None,
+                 output_dir: Optional[str] = None):
         random.seed(random_seed)
         np.random.seed(random_seed)
         torch.manual_seed(random_seed)
@@ -43,6 +46,7 @@ class Evaluator(ABC):
         self.fewshot_as_multiturn = fewshot_as_multiturn
         self.system_prompt = system_prompt
         self.chat_template_kwargs = chat_template_kwargs
+        self.output_dir = output_dir
 
     @abstractmethod
     def generate_samples(self) -> Iterable[tuple]:
@@ -105,6 +109,11 @@ class Evaluator(ABC):
         results = []
         for output in tqdm(outputs, desc="Fetching responses"):
             results.append(output.result())
+
+        if self.output_dir:
+            dump_inference_results(self.output_dir, results,
+                                   getattr(llm, 'tokenizer', None))
+
         profiler.stop("trtllm exec")
         elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")
         logger.info(f"TRTLLM execution time: {elapsed_time:.3f} seconds.")
@@ -116,3 +125,60 @@ class Evaluator(ABC):
     @staticmethod
     def command(ctx, *args, **kwargs) -> None:
         raise NotImplementedError()
+
+
+def dump_inference_results(output_dir: str, results: List[dict],
+                           tokenizer: Any):
+    if not output_dir:
+        return
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Collect results
+    results_list = []
+    for task_id, result in enumerate(results):
+        output_ids = result.outputs[0].token_ids
+        output_text = result.outputs[0].text.strip()
+        input_text = result.prompt.strip()
+        input_ids = tokenizer.encode(input_text)
+        results_list.append({
+            "task_id": task_id,
+            "input_ids": input_ids,
+            "output_ids": output_ids,
+            "input_text": input_text,
+            "output_text": output_text
+        })
+
+    # Dump token ids
+    ids_path = os.path.join(output_dir, "dumped_ids.json")
+    try:
+        with open(ids_path, "w") as f:
+            for item in results_list:
+                data = {
+                    "task_id": item["task_id"],
+                    "input_ids": item["input_ids"],
+                    "output_ids": item["output_ids"],
+                    "input_tokens": len(item["input_ids"]),
+                    "output_tokens": len(item["output_ids"])
+                }
+                f.write(json.dumps(data) + "\n")
+        logger.info(f"Dumped IDs to {ids_path}")
+    except Exception as e:
+        logger.warning(f"Failed to dump IDs to {ids_path}: {e}")
+
+    # Dump text
+    text_path = os.path.join(output_dir, "dumped_text.json")
+    try:
+        with open(text_path, "w") as f:
+            for item in results_list:
+                data = {
+                    "task_id": item["task_id"],
+                    "input_text": item["input_text"],
+                    "output_text": item["output_text"],
+                    "input_len": len(item["input_text"]),
+                    "output_len": len(item["output_text"])
+                }
+                f.write(json.dumps(data) + "\n")
+        logger.info(f"Dumped text to {text_path}")
+    except Exception as e:
+        logger.warning(f"Failed to dump text to {text_path}: {e}")
diff --git a/tensorrt_llm/evaluate/json_mode_eval.py b/tensorrt_llm/evaluate/json_mode_eval.py
index 37360754e5..8e6b0b814e 100644
--- a/tensorrt_llm/evaluate/json_mode_eval.py
+++ b/tensorrt_llm/evaluate/json_mode_eval.py
@@ -36,13 +36,15 @@ class JsonModeEval(Evaluator):
                  num_samples: Optional[int] = None,
                  random_seed: int = 0,
                  apply_chat_template: bool = True,
-                 system_prompt: Optional[str] = None):
+                 system_prompt: Optional[str] = None,
+                 output_dir: Optional[str] = None):
         if not apply_chat_template:
             raise ValueError(
                 f"{self.__class__.__name__} requires apply_chat_template=True.")
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
-                         system_prompt=system_prompt)
+                         system_prompt=system_prompt,
+                         output_dir=output_dir)
         if dataset_path is None:
             dataset_path = "NousResearch/json-mode-eval"
         self.data = datasets.load_dataset(dataset_path,
@@ -120,11 +122,16 @@ class JsonModeEval(Evaluator):
                   type=int,
                   default=512,
                   help="Maximum generation length.")
+    @click.option("--output_dir",
+                  type=str,
+                  default=None,
+                  help="Directory to save the dumped inference results.")
     @click.pass_context
     @staticmethod
     def command(ctx, dataset_path: Optional[str], num_samples: int,
                 random_seed: int, system_prompt: Optional[str],
-                max_input_length: int, max_output_length: int) -> None:
+                max_input_length: int, max_output_length: int,
+                output_dir: Optional[str]) -> None:
         llm: Union[LLM, PyTorchLLM] = ctx.obj
         sampling_params = SamplingParams(
             max_tokens=max_output_length,
@@ -133,6 +140,7 @@
                                  num_samples=num_samples,
                                  random_seed=random_seed,
                                  apply_chat_template=True,
-                                 system_prompt=system_prompt)
+                                 system_prompt=system_prompt,
+                                 output_dir=output_dir)
         evaluator.evaluate(llm, sampling_params)
         llm.shutdown()
diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
index ab51b8feec..8fbb966075 100644
--- a/tensorrt_llm/evaluate/lm_eval.py
+++ b/tensorrt_llm/evaluate/lm_eval.py
@@ -40,7 +40,7 @@ from ..inputs.utils import apply_chat_template as trtllm_apply_chat_template
 from ..llmapi import RequestOutput
 from ..logger import logger
 from ..sampling_params import SamplingParams
-from .interface import Evaluator
+from .interface import Evaluator, dump_inference_results
 
 # NOTE: lm_eval uses "<image>" as the default image placeholder
 # https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25
@@ -55,12 +55,14 @@ class LmEvalWrapper(TemplateLM):
                  streaming: bool = False,
                  chat_template_kwargs: Optional[dict[str, Any]] = None,
                  model_type: str | None = None,
-                 is_force_single_image: bool = False):
+                 is_force_single_image: bool = False,
+                 output_dir: Optional[str] = None):
         super().__init__()
         self.llm = llm
         self.sampling_params = sampling_params
         self.streaming = streaming
         self.chat_template_kwargs = chat_template_kwargs
+        self.output_dir = output_dir
 
     @property
     def eot_token_id(self) -> int:
@@ -145,6 +147,10 @@ class LmEvalWrapper(TemplateLM):
                            disable=disable_tqdm):
             outputs.append(output.result())
 
+        if self.output_dir:
+            dump_inference_results(self.output_dir, outputs,
+                                   getattr(self.llm, 'tokenizer', None))
+
         profiler.stop("trtllm exec")
         elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")
         logger.info(f"TRTLLM execution time: {elapsed_time:.3f} seconds.")
@@ -168,7 +174,8 @@ class MultimodalLmEvalWrapper(LmEvalWrapper):
                  max_images: int = 999,
                  chat_template_kwargs: Optional[dict[str, Any]] = None,
                  model_type: str | None = None,
-                 is_force_single_image: bool = False):
+                 is_force_single_image: bool = False,
+                 output_dir: Optional[str] = None):
         """
         Initialize the multimodal wrapper.
 
         Args:
             llm: The underlying LLM object
             sampling_params: Parameters for text generation
             streaming: Whether to use streaming generation
             max_images: Maximum number of images per prompt (currently unlimited in TRT-LLM), set to 999 from lm_eval's default value.
+            chat_template_kwargs: Chat template kwargs as JSON string
+            output_dir: Directory to save the dumped inference results.
""" - super().__init__(llm, sampling_params, streaming) + super().__init__(llm, sampling_params, streaming, output_dir=output_dir) # NOTE: Required by lm_eval to identify this as a multimodal model self.MULTIMODAL = True @@ -316,6 +325,10 @@ class MultimodalLmEvalWrapper(LmEvalWrapper): disable=disable_tqdm): outputs.append(output.result()) + if self.output_dir: + dump_inference_results(self.output_dir, outputs, + getattr(self.llm, 'tokenizer', None)) + profiler.stop("trtllm exec") elapsed_time = profiler.elapsed_time_in_sec("trtllm exec") logger.info(f"TRTLLM execution time: {elapsed_time:.3f} seconds.") @@ -337,7 +350,8 @@ class LmEvalEvaluator(Evaluator): is_multimodal: bool = False, chat_template_kwargs: Optional[dict[str, Any]] = None, log_samples: bool = False, - output_path: Optional[str] = None): + output_path: Optional[str] = None, + output_dir: Optional[str] = None): try: import lm_eval except ImportError as e: @@ -356,7 +370,8 @@ class LmEvalEvaluator(Evaluator): apply_chat_template=apply_chat_template, fewshot_as_multiturn=fewshot_as_multiturn, system_prompt=system_prompt, - chat_template_kwargs=chat_template_kwargs) + chat_template_kwargs=chat_template_kwargs, + output_dir=output_dir) self.task_name = task_name self.dataset_path = dataset_path self.num_samples = num_samples @@ -458,13 +473,15 @@ class LmEvalEvaluator(Evaluator): is_force_single_image: bool = False) -> float: import lm_eval lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper + results = lm_eval.evaluate( lm=lm_cls(llm, sampling_params=sampling_params, streaming=streaming, chat_template_kwargs=self.chat_template_kwargs, model_type=model_type, - is_force_single_image=is_force_single_image), + is_force_single_image=is_force_single_image, + output_dir=self.output_dir), task_dict=self.task_dict, limit=self.num_samples, apply_chat_template=self.apply_chat_template, @@ -513,7 +530,8 @@ class LmEvalEvaluator(Evaluator): chat_template_kwargs=kwargs.pop("chat_template_kwargs", None), log_samples=kwargs.pop("log_samples", False), - output_path=kwargs.pop("output_path", None)) + output_path=kwargs.pop("output_path", None), + output_dir=kwargs.pop("output_dir", None)) sampling_params = SamplingParams( max_tokens=kwargs.pop("max_output_length"), truncate_prompt_tokens=kwargs.pop("max_input_length"), @@ -578,6 +596,10 @@ class GSM8K(LmEvalEvaluator): type=str, default=None, help="Path to save evaluation results.") + @click.option("--output_dir", + type=str, + default=None, + help="Directory to save the task infos.") @click.pass_context @staticmethod def command(ctx, **kwargs) -> None: @@ -640,6 +662,10 @@ class GPQADiamond(LmEvalEvaluator): type=str, default=None, help="Path to save evaluation results.") + @click.option("--output_dir", + type=str, + default=None, + help="Directory to save the task infos.") @click.pass_context @staticmethod def command(ctx, **kwargs) -> None: @@ -698,6 +724,10 @@ class GPQAMain(LmEvalEvaluator): type=str, default=None, help="Path to save evaluation results.") + @click.option("--output_dir", + type=str, + default=None, + help="Directory to save the task infos.") @click.pass_context @staticmethod def command(ctx, **kwargs) -> None: @@ -756,6 +786,10 @@ class GPQAExtended(LmEvalEvaluator): type=str, default=None, help="Path to save evaluation results.") + @click.option("--output_dir", + type=str, + default=None, + help="Directory to save the task infos.") @click.pass_context @staticmethod def command(ctx, **kwargs) -> None: @@ -815,6 +849,10 @@ class MMMU(LmEvalEvaluator): 
type=str, default=None, help="Path to save evaluation results.") + @click.option("--output_dir", + type=str, + default=None, + help="Directory to save the task infos.") @click.pass_context @staticmethod def command(ctx, **kwargs) -> None: @@ -971,6 +1009,10 @@ class LongBenchV1(LmEvalEvaluator): type=str, default=None, help="Path to save evaluation results.") + @click.option("--output_dir", + type=str, + default=None, + help="Directory to save the task infos.") @click.pass_context @staticmethod def command(ctx, **kwargs) -> None: @@ -984,7 +1026,8 @@ class LongBenchV1(LmEvalEvaluator): system_prompt=kwargs.pop("system_prompt", None), chat_template_kwargs=kwargs.pop("chat_template_kwargs", None), log_samples=kwargs.pop("log_samples", False), - output_path=kwargs.pop("output_path", None)) + output_path=kwargs.pop("output_path", None), + output_dir=kwargs.pop("output_dir", None)) # Let lm-eval task configs control sampling via gen_kwargs. sampling_params = None diff --git a/tensorrt_llm/evaluate/longbench_v2.py b/tensorrt_llm/evaluate/longbench_v2.py index 1f28aa2cec..1b2ccaf221 100644 --- a/tensorrt_llm/evaluate/longbench_v2.py +++ b/tensorrt_llm/evaluate/longbench_v2.py @@ -62,7 +62,9 @@ class LongBenchV2(Evaluator): cot: bool = False, no_context: bool = False, rag: int = 0, + max_len: int = 128000, max_input_length: int = 128000, + max_output_length: int = 32000, output_dir: Optional[str] = None, random_seed: int = 0, apply_chat_template: bool = False, @@ -81,8 +83,10 @@ class LongBenchV2(Evaluator): cot: Enable Chain-of-Thought reasoning no_context: Test without context (memorization test) rag: Number of top retrieved contexts to use (0 to disable) - max_input_length: Maximum prompt length in tokens for truncation - output_dir: Directory to save evaluation results + max_len: Maximum length (input + output) in tokens + max_input_length: Maximum context length in tokens. If exceeds, the prompt will be truncated in the middle. + max_output_length: Maximum output length in tokens for truncation + output_dir: Directory to save the task infos random_seed: Random seed for reproducibility apply_chat_template: Whether to apply model's chat template system_prompt: System prompt to prepend @@ -91,7 +95,8 @@ class LongBenchV2(Evaluator): super().__init__(random_seed=random_seed, apply_chat_template=apply_chat_template, system_prompt=system_prompt, - chat_template_kwargs=chat_template_kwargs) + chat_template_kwargs=chat_template_kwargs, + output_dir=output_dir) self.dataset_path = dataset_path self.num_samples = num_samples @@ -103,7 +108,9 @@ class LongBenchV2(Evaluator): self.no_context = no_context self.rag = rag self.output_dir = output_dir - self.max_input_length = max_input_length + # We need to minus max_output_length from max_len to reserve budget for output tokens. + self.max_input_length = min(max_input_length, + max_len - max_output_length) # Will be set during evaluation self.tokenizer = None @@ -305,7 +312,6 @@ class LongBenchV2(Evaluator): If the prompt exceeds max_input_length, it takes the first half and last half to preserve both context beginning and end. - We need to minus max_output_length from max_len to reserve budget for output tokens. 
 
         Args:
             prompt: The prompt string to truncate
@@ -714,7 +720,7 @@ class LongBenchV2(Evaluator):
     @click.option("--output_dir",
                   type=str,
                   default=None,
-                  help="Directory to save results.")
+                  help="Directory to save the dumped inference results.")
     @click.option("--random_seed",
                   type=int,
                   default=0,
@@ -727,12 +733,19 @@ class LongBenchV2(Evaluator):
                   type=str,
                   default=None,
                   help="System prompt.")
+    @click.option(
+        "--max_len",
+        type=int,
+        default=1024000,
+        help=
+        "Maximum total length (input + output) in tokens supported by the model."
+    )
     @click.option(
         "--max_input_length",
         type=int,
         default=128000,
         help=
-        "Maximum prompt length before apply chat template. If exceeds, the prompt will be truncated in the middle."
+        "Maximum context length in tokens. If exceeded, the prompt is truncated in the middle."
     )
     @click.option("--max_output_length",
                   type=int,
@@ -763,7 +776,7 @@ class LongBenchV2(Evaluator):
                 cot: bool, no_context: bool, rag: int,
                 output_dir: Optional[str], random_seed: int,
                 apply_chat_template: bool, system_prompt: Optional[str],
-                max_input_length: int, max_output_length: int,
+                max_len: int, max_input_length: int, max_output_length: int,
                 chat_template_kwargs: Optional[dict[str, Any]],
                 temperature: float, top_p: float) -> None:
         llm: Union[LLM, PyTorchLLM] = ctx.obj
@@ -782,7 +795,9 @@ class LongBenchV2(Evaluator):
                                 cot=cot,
                                 no_context=no_context,
                                 rag=rag,
+                                max_len=max_len,
                                 max_input_length=max_input_length,
+                                max_output_length=max_output_length,
                                 output_dir=output_dir,
                                 random_seed=random_seed,
                                 apply_chat_template=apply_chat_template,
diff --git a/tensorrt_llm/evaluate/mmlu.py b/tensorrt_llm/evaluate/mmlu.py
index 89be382396..43b4d0dc34 100644
--- a/tensorrt_llm/evaluate/mmlu.py
+++ b/tensorrt_llm/evaluate/mmlu.py
@@ -121,11 +121,13 @@ class MMLU(Evaluator):
                  random_seed: int = 0,
                  apply_chat_template: bool = False,
                  system_prompt: Optional[str] = None,
-                 chat_template_kwargs: Optional[dict[str, Any]] = None):
+                 chat_template_kwargs: Optional[dict[str, Any]] = None,
+                 output_dir: Optional[str] = None):
         super().__init__(random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
                          system_prompt=system_prompt,
-                         chat_template_kwargs=chat_template_kwargs)
+                         chat_template_kwargs=chat_template_kwargs,
+                         output_dir=output_dir)
         if dataset_path is None:
             dataset_path = self.dowload_dataset()
         self.dataset_path = dataset_path
@@ -302,6 +304,10 @@ class MMLU(Evaluator):
                   help="Maximum generation length.")
     @click.option("--check_accuracy", is_flag=True, default=False)
     @click.option("--accuracy_threshold", type=float, default=30)
+    @click.option("--output_dir",
+                  type=str,
+                  default=None,
+                  help="Directory to save the dumped inference results.")
     @click.pass_context
     @staticmethod
     def command(ctx, dataset_path: Optional[str], num_samples: int,
@@ -309,7 +315,7 @@ class MMLU(Evaluator):
                 random_seed: int, apply_chat_template: bool,
                 chat_template_kwargs: Optional[dict[str, Any]],
                 system_prompt: Optional[str], max_input_length: int,
                 max_output_length: int, check_accuracy: bool,
-                accuracy_threshold: float) -> None:
+                accuracy_threshold: float, output_dir: Optional[str]) -> None:
         llm: Union[LLM, PyTorchLLM] = ctx.obj
         sampling_params = SamplingParams(
             max_tokens=max_output_length,
@@ -320,7 +326,8 @@
                          random_seed=random_seed,
                          apply_chat_template=apply_chat_template,
                          system_prompt=system_prompt,
-                         chat_template_kwargs=chat_template_kwargs)
+                         chat_template_kwargs=chat_template_kwargs,
+                         output_dir=output_dir)
         accuracy = evaluator.evaluate(llm, sampling_params)
         llm.shutdown()
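
Usage note (not part of the patch): dump_inference_results writes two JSON Lines files into the directory passed via --output_dir: dumped_ids.json (token IDs plus input/output token counts) and dumped_text.json (prompt and completion text plus character lengths), one record per task. A minimal sketch for reading the dumps back, assuming a hypothetical directory name "eval_dump" standing in for whatever was given to --output_dir:

import json
import os


def load_dumped_results(output_dir):
    """Read the two JSONL dumps written by dump_inference_results."""
    records = {}
    for name in ("dumped_ids.json", "dumped_text.json"):
        path = os.path.join(output_dir, name)
        with open(path) as f:
            # One JSON object per line; "task_id" reflects the request order.
            records[name] = [json.loads(line) for line in f if line.strip()]
    return records


if __name__ == "__main__":
    dumps = load_dumped_results("eval_dump")  # hypothetical path
    for ids, text in zip(dumps["dumped_ids.json"], dumps["dumped_text.json"]):
        print(ids["task_id"], ids["input_tokens"], ids["output_tokens"],
              text["output_text"][:80])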