Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

Merge b3d0f4a883 into 6df2c8a074

This commit is contained in commit 260eacee4b.
@@ -28,7 +28,7 @@ jieba==0.42.1
 rouge==1.0.1
 pytest-rerunfailures
 ruff==0.9.4
-lm_eval[api]==0.4.9.2
+lm_eval[api]==0.4.9.1 # 0.4.9.2 bug for custom config: https://github.com/EleutherAI/lm-evaluation-harness/pull/3436
 docstring_parser
 genai-perf==0.0.13
 opentelemetry-sdk>=1.26.0
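If it helps to confirm the pin locally, here is a small check of the installed lm-eval version; the distribution name "lm_eval" is an assumption, not something stated in this change:

# Print the installed lm-eval version; useful to confirm the 0.4.9.1 pin took effect.
import importlib.metadata

print(importlib.metadata.version("lm_eval"))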
@@ -16,6 +16,7 @@ import copy
 import json
 import os
 from contextlib import contextmanager
+from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import click
@@ -334,7 +335,9 @@ class LmEvalEvaluator(Evaluator):
                  fewshot_as_multiturn: bool = False,
                  system_prompt: Optional[str] = None,
                  is_multimodal: bool = False,
-                 chat_template_kwargs: Optional[dict[str, Any]] = None):
+                 chat_template_kwargs: Optional[dict[str, Any]] = None,
+                 log_samples: bool = False,
+                 output_path: Optional[str] = None):
         try:
             import lm_eval
         except ImportError as e:
@@ -357,6 +360,8 @@ class LmEvalEvaluator(Evaluator):
         self.task_name = task_name
         self.dataset_path = dataset_path
         self.num_samples = num_samples
+        self.log_samples = log_samples
+        self.output_path = output_path

         task_manager = TaskManager(
             include_path=f"{os.path.dirname(__file__)}/lm_eval_tasks")
@@ -436,6 +441,14 @@ class LmEvalEvaluator(Evaluator):
                  *auxiliaries) -> float:
         raise NotImplementedError()

+    def save_results(self, results: dict) -> None:
+        path = Path(self.output_path)
+        path.mkdir(parents=True, exist_ok=True)
+        result_path = (path / f"samples_{self.task_name}.json")
+        with open(result_path, "w") as f:
+            json.dump(results, f, indent=2)
+        logger.info(f"Results saved to {result_path}")
+
     def evaluate(self,
                  llm: Union[LLM, PyTorchLLM],
                  sampling_params: Optional[SamplingParams] = None,
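For reference, a minimal standalone sketch of what the new save_results helper does; the output directory, task name, and results dict below are illustrative stand-ins, not values taken from this change:

import json
from pathlib import Path

# Illustrative inputs; in the evaluator these come from --output_path and the task name.
output_path = "./eval_results"
task_name = "gsm8k"
results = {"results": {task_name: {"exact_match,strict-match": 50.0}}}

# Mirror of the save_results logic: create the directory and dump the results as JSON.
path = Path(output_path)
path.mkdir(parents=True, exist_ok=True)
result_path = path / f"samples_{task_name}.json"
with open(result_path, "w") as f:
    json.dump(results, f, indent=2)
print(f"Results saved to {result_path}")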
@@ -456,7 +469,9 @@ class LmEvalEvaluator(Evaluator):
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
             fewshot_as_multiturn=self.fewshot_as_multiturn,
-            system_instruction=self.system_prompt)
+            system_instruction=self.system_prompt,
+            log_samples=self.log_samples)
+
         # Normalize scores to range 0~100
         scores = results["results"][self.task_name]
         for metric in scores.keys():
@@ -465,6 +480,11 @@ class LmEvalEvaluator(Evaluator):
         logger.info(
             f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
         )
+
+        # Save results if output_path is specified
+        if self.output_path:
+            self.save_results(results)
+
         if scores_filter is not None:
             result_acc = results["results"][self.task_name][scores_filter]
             logger.info(
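A quick, illustrative read-back of the file written above. The directory and task name are assumptions; the dict layout is whatever lm_eval.simple_evaluate returned, which should include per-sample records when log_samples=True (the exact schema depends on the lm-eval version):

import json
from pathlib import Path

# Assumed location, mirroring --output_path ./eval_results and the gsm8k task name.
result_file = Path("./eval_results") / "samples_gsm8k.json"
if result_file.exists():
    results = json.loads(result_file.read_text())
    # "results" holds the aggregated scores; "samples" is expected when log_samples=True.
    print(sorted(results.keys()))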
@@ -491,7 +511,9 @@ class LmEvalEvaluator(Evaluator):
             system_prompt=kwargs.pop("system_prompt", None),
             is_multimodal=kwargs.pop("is_multimodal", False),
             chat_template_kwargs=kwargs.pop("chat_template_kwargs",
-                                            None))
+                                            None),
+            log_samples=kwargs.pop("log_samples", False),
+            output_path=kwargs.pop("output_path", None))
         sampling_params = SamplingParams(
             max_tokens=kwargs.pop("max_output_length"),
             truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -548,6 +570,14 @@ class GSM8K(LmEvalEvaluator):
                   type=int,
                   default=256,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
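The same pair of options is added to every task subcommand below (the GPQA variants, MMMU, and LongBench). A self-contained sketch of how these click flags parse; the command name and echo body are made up for illustration:

import click

@click.command()
@click.option("--log_samples",
              is_flag=True,
              default=False,
              help="Log sample outputs for debugging.")
@click.option("--output_path",
              type=str,
              default=None,
              help="Path to save evaluation results.")
def demo(log_samples: bool, output_path: str) -> None:
    # Echo the parsed flags; the real commands forward them to the evaluator constructor.
    click.echo(f"log_samples={log_samples}, output_path={output_path}")

if __name__ == "__main__":
    demo()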
@@ -602,6 +632,14 @@ class GPQADiamond(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -652,6 +690,14 @@ class GPQAMain(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -702,6 +748,14 @@ class GPQAExtended(LmEvalEvaluator):
                   type=int,
                   default=32768,
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -753,6 +807,14 @@ class MMMU(LmEvalEvaluator):
                   default=
                   512, # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13
                   help="Maximum generation length.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -823,12 +885,16 @@ class LongBenchV1(LmEvalEvaluator):
             limit=self.num_samples,
             apply_chat_template=self.apply_chat_template,
             fewshot_as_multiturn=self.fewshot_as_multiturn,
-            system_instruction=self.system_prompt)
-
+            system_instruction=self.system_prompt,
+            log_samples=self.log_samples)
         logger.info(
             f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
         )

+        # Save results if output_path is specified
+        if self.output_path:
+            self.save_results(results)
+
         # LongBench is a group task in lm-eval. lm-eval already computes subgroup
         # "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
         # To keep this implementation simple and aligned with the printed table,
@@ -897,6 +963,14 @@ class LongBenchV1(LmEvalEvaluator):
                   type=str,
                   default=None,
                   help="System prompt.")
+    @click.option("--log_samples",
+                  is_flag=True,
+                  default=False,
+                  help="Log sample outputs for debugging.")
+    @click.option("--output_path",
+                  type=str,
+                  default=None,
+                  help="Path to save evaluation results.")
     @click.pass_context
     @staticmethod
     def command(ctx, **kwargs) -> None:
@@ -908,7 +982,9 @@ class LongBenchV1(LmEvalEvaluator):
             random_seed=kwargs.pop("random_seed", 0),
             apply_chat_template=kwargs.pop("apply_chat_template", True),
             system_prompt=kwargs.pop("system_prompt", None),
-            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None))
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None),
+            log_samples=kwargs.pop("log_samples", False),
+            output_path=kwargs.pop("output_path", None))

         # Let lm-eval task configs control sampling via gen_kwargs.
         sampling_params = None