commit 260eacee4b
Zhenhuan Chen, 2026-01-13 21:15:45 +08:00 (committed by GitHub)
2 changed files with 83 additions and 7 deletions

File 1 of 2

@@ -28,7 +28,7 @@ jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures
ruff==0.9.4
-lm_eval[api]==0.4.9.2
+lm_eval[api]==0.4.9.1 # 0.4.9.2 bug for custom config: https://github.com/EleutherAI/lm-evaluation-harness/pull/3436
docstring_parser
genai-perf==0.0.13
opentelemetry-sdk>=1.26.0
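The pin above works around a custom-task-config regression in lm_eval 0.4.9.2 (see the linked upstream PR). Below is a minimal, illustrative guard one could put in an eval entrypoint to fail fast on the bad release; it is not part of this commit and only assumes the lm_eval distribution is installed.

from importlib.metadata import version

# Illustrative only: refuse to run against the known-bad lm_eval release.
if version("lm_eval") == "0.4.9.2":
    raise RuntimeError(
        "lm_eval 0.4.9.2 breaks custom task configs "
        "(https://github.com/EleutherAI/lm-evaluation-harness/pull/3436); "
        "use the pinned 0.4.9.1 instead."
    )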

File 2 of 2

@@ -16,6 +16,7 @@ import copy
import json
import os
from contextlib import contextmanager
+from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import click
@@ -334,7 +335,9 @@ class LmEvalEvaluator(Evaluator):
fewshot_as_multiturn: bool = False,
system_prompt: Optional[str] = None,
is_multimodal: bool = False,
-chat_template_kwargs: Optional[dict[str, Any]] = None):
+chat_template_kwargs: Optional[dict[str, Any]] = None,
+log_samples: bool = False,
+output_path: Optional[str] = None):
try:
import lm_eval
except ImportError as e:
@@ -357,6 +360,8 @@ class LmEvalEvaluator(Evaluator):
self.task_name = task_name
self.dataset_path = dataset_path
self.num_samples = num_samples
+self.log_samples = log_samples
+self.output_path = output_path
task_manager = TaskManager(
include_path=f"{os.path.dirname(__file__)}/lm_eval_tasks")
@@ -436,6 +441,14 @@ class LmEvalEvaluator(Evaluator):
*auxiliaries) -> float:
raise NotImplementedError()
+def save_results(self, results: dict) -> None:
+    path = Path(self.output_path)
+    path.mkdir(parents=True, exist_ok=True)
+    result_path = (path / f"samples_{self.task_name}.json")
+    with open(result_path, "w") as f:
+        json.dump(results, f, indent=2)
+    logger.info(f"Results saved to {result_path}")
def evaluate(self,
llm: Union[LLM, PyTorchLLM],
sampling_params: Optional[SamplingParams] = None,
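For reference, the file written by the new save_results() helper is plain JSON named samples_<task_name>.json under output_path. A minimal sketch of reading it back is shown here; the directory and task name are placeholders, not values from this commit.

import json
from pathlib import Path

output_dir = Path("eval_outputs")   # placeholder: whatever was passed as output_path
task_name = "gsm8k"                 # placeholder task name

result_file = output_dir / f"samples_{task_name}.json"
with open(result_file) as f:
    results = json.load(f)

# Per-task metrics sit under results["results"][<task_name>], matching the
# dict that evaluate() hands to save_results() in this diff.
print(results["results"][task_name])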
@@ -456,7 +469,9 @@
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
fewshot_as_multiturn=self.fewshot_as_multiturn,
-system_instruction=self.system_prompt)
+system_instruction=self.system_prompt,
+log_samples=self.log_samples)
# Normalize scores to range 0~100
scores = results["results"][self.task_name]
for metric in scores.keys():
@@ -465,6 +480,11 @@
logger.info(
f"lm-eval {self.task_name} results (scores normalized to range 0~100):\n{lm_eval.utils.make_table(results)}"
)
+# Save results if output_path is specified
+if self.output_path:
+    self.save_results(results)
if scores_filter is not None:
result_acc = results["results"][self.task_name][scores_filter]
logger.info(
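Forwarding log_samples into the lm-eval call asks the harness to keep per-sample records in the returned dict (under a "samples" key in recent lm-eval releases), so they end up in the saved JSON alongside the aggregate scores. The helper below is a hedged sketch for inspecting those records after evaluation; it is hypothetical and the record fields reflect lm-eval's sample format, not code from this commit.

def preview_logged_samples(results: dict, task_name: str, n: int = 3) -> None:
    # "samples" is where lm-eval places per-sample records when log_samples=True;
    # each record typically includes the source doc id, raw responses ("resps")
    # and the post-filter responses used for scoring ("filtered_resps").
    for record in results.get("samples", {}).get(task_name, [])[:n]:
        print(record.get("doc_id"), record.get("filtered_resps"))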
@@ -491,7 +511,9 @@
system_prompt=kwargs.pop("system_prompt", None),
is_multimodal=kwargs.pop("is_multimodal", False),
chat_template_kwargs=kwargs.pop("chat_template_kwargs",
-None))
+None),
+log_samples=kwargs.pop("log_samples", False),
+output_path=kwargs.pop("output_path", None))
sampling_params = SamplingParams(
max_tokens=kwargs.pop("max_output_length"),
truncate_prompt_tokens=kwargs.pop("max_input_length"),
@@ -548,6 +570,14 @@ class GSM8K(LmEvalEvaluator):
type=int,
default=256,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
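The same --log_samples/--output_path option pair is repeated on each task command below (GPQADiamond, GPQAMain, GPQAExtended, MMMU, LongBenchV1). As a standalone sketch of how that pair behaves in a bare click command, assuming only the click library; the command name is hypothetical and unrelated to the project's CLI.

import click

@click.command()
@click.option("--log_samples",
              is_flag=True,
              default=False,
              help="Log sample outputs for debugging.")
@click.option("--output_path",
              type=str,
              default=None,
              help="Path to save evaluation results.")
def demo_eval(log_samples, output_path):
    # is_flag=True makes the option boolean: present -> True, absent -> False.
    # output_path stays None unless given, which is what the evaluators use to
    # decide whether to call save_results().
    click.echo(f"log_samples={log_samples}, output_path={output_path}")

if __name__ == "__main__":
    demo_eval()   # e.g. `python demo_eval.py --log_samples --output_path results/`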
@@ -602,6 +632,14 @@ class GPQADiamond(LmEvalEvaluator):
type=int,
default=32768,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -652,6 +690,14 @@ class GPQAMain(LmEvalEvaluator):
type=int,
default=32768,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -702,6 +748,14 @@ class GPQAExtended(LmEvalEvaluator):
type=int,
default=32768,
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -753,6 +807,14 @@ class MMMU(LmEvalEvaluator):
default=
512, # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L13
help="Maximum generation length.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -823,12 +885,16 @@ class LongBenchV1(LmEvalEvaluator):
limit=self.num_samples,
apply_chat_template=self.apply_chat_template,
fewshot_as_multiturn=self.fewshot_as_multiturn,
-system_instruction=self.system_prompt)
+system_instruction=self.system_prompt,
+log_samples=self.log_samples)
logger.info(
f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
)
+# Save results if output_path is specified
+if self.output_path:
+    self.save_results(results)
# LongBench is a group task in lm-eval. lm-eval already computes subgroup
# "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
# To keep this implementation simple and aligned with the printed table,
@@ -897,6 +963,14 @@ class LongBenchV1(LmEvalEvaluator):
type=str,
default=None,
help="System prompt.")
@click.option("--log_samples",
is_flag=True,
default=False,
help="Log sample outputs for debugging.")
@click.option("--output_path",
type=str,
default=None,
help="Path to save evaluation results.")
@click.pass_context
@staticmethod
def command(ctx, **kwargs) -> None:
@@ -908,7 +982,9 @@ class LongBenchV1(LmEvalEvaluator):
random_seed=kwargs.pop("random_seed", 0),
apply_chat_template=kwargs.pop("apply_chat_template", True),
system_prompt=kwargs.pop("system_prompt", None),
-chat_template_kwargs=kwargs.pop("chat_template_kwargs", None))
+chat_template_kwargs=kwargs.pop("chat_template_kwargs", None),
+log_samples=kwargs.pop("log_samples", False),
+output_path=kwargs.pop("output_path", None))
# Let lm-eval task configs control sampling via gen_kwargs.
sampling_params = None