TensorRT-LLMs/tests/integration/defs/accuracy/scripts/collect_evaluated_accuracies.py
Enwei Zhu 3fa19ffa4e
test [TRTLLM-4477,TRTLLM-4481]: Accuracy test improvement (Part 3.5): Support GSM8K and GPQA (#3483)
* add gsm8k

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix gsm8k

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* add gpqa

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* conditional import lm_eval

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* gpqa in lm_eval

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* system prompt

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* shuffle

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* update AA prompt and regex

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* revert AA prompt and regex

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* integration to tests

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* add DS-R1

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix and clean

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* update tests

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* update

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* clean up

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* free_gpu_memory_fraction=0.8

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

---------

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
2025-04-22 07:38:16 +08:00

66 lines
2.1 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
import pandas as pd
# Maps a metric name to the regex that pulls its value out of one log line.
# Each pattern uses a lookbehind on the label printed in the log, so the match
# itself is only the decimal number (digits, a dot, digits) after the label.
# Insertion order is kept because it determines the column order of the
# resulting table.
metric_regex = dict((
    # ROUGE-1 score.
    ("rouge1", r"(?<=rouge1: )\d+\.\d+"),
    # MMLU weighted average accuracy.
    ("mmlu", r"(?<=MMLU weighted average accuracy: )\d+\.\d+"),
    # GSM8K average accuracy.
    ("gsm8k", r"(?<=gsm8k average accuracy: )\d+\.\d+"),
    # GPQA diamond average accuracy (logged under the
    # gpqa_diamond_cot_zeroshot_aa label).
    ("gpqa_diamond",
     r"(?<=gpqa_diamond_cot_zeroshot_aa average accuracy: )\d+\.\d+"),
    # Per-token perplexity.
    ("perplexity", r"(?<=Per-token perplexity: )\d+\.\d+"),
    # Passkey accuracy.
    ("passkey", r"(?<=passkey accuracy: )\d+\.\d+"),
))
def collect_accuracies(log, regexes):
    """Extract per-test-case metric values from the text of a test log.

    The log must contain a line with ``items in this shard: `` followed by
    the test case names separated by ``", "``. The log is then scanned line
    by line: a line that starts with the next expected test case name makes
    that test case the current one, and every later line is searched for
    each metric until the following test case starts.

    Args:
        log: Full text of the log file.
        regexes: Mapping of metric name to a regex whose match is the
            metric value (parsed with ``float``).

    Returns:
        A ``(test_cases, data)`` tuple. ``test_cases`` is the list of test
        case names; ``data`` is a parallel list of dicts mapping metric name
        to the value found for that test case.

    Raises:
        ValueError: If the log has no ``items in this shard: `` line.
    """
    shard = re.search(r"(?<=items in this shard: ).+", log)
    if shard is None:
        raise ValueError(
            "cannot find 'items in this shard: ' in the log; "
            "unable to determine the list of test cases")
    test_cases = shard.group().split(", ")

    data = [{} for _ in test_cases]
    # Index of the test case whose output is being read; -1 until the first
    # test case name has been seen.
    i = -1
    for line in log.split("\n"):
        # Only the *next* listed test case is looked for, so test cases are
        # matched in the order in which the shard line lists them.
        if i + 1 < len(test_cases) and line.startswith(test_cases[i + 1]):
            # Advance to next test case
            i += 1
            continue

        # Output before the first test case belongs to nobody.
        if i < 0:
            continue

        entry = data[i]
        for metric, regex in regexes.items():
            # Keep the first value reported for a metric within a test case.
            if metric in entry:
                continue
            matched = re.search(regex, line)
            if matched:
                entry[metric] = float(matched.group())

    return test_cases, data


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--log_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, default=None)
    args = parser.parse_args()

    with open(args.log_file) as f:
        log = f.read()

    try:
        test_cases, data = collect_accuracies(log, metric_regex)
    except ValueError as err:
        # Report a malformed log as a usage error instead of a traceback.
        parser.error(f"{args.log_file}: {err}")

    # One row per test case, one column per metric; metrics that were not
    # found for a test case are left missing.
    df = pd.DataFrame(data, index=test_cases)
    print(df)
    if args.output_file:
        df.to_csv(args.output_file)