TensorRT-LLM/benchmarks/python/check_accuracy_mlperf.py

import argparse
import json
import os
from enum import Enum

import evaluate
import nltk
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, LlamaTokenizerFast

nltk.download("punkt", quiet=False)


class Model(Enum):
    Llama_v2_70B = 1
    GPT_J = 2


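# Accuracy thresholds checked in main(). The multipliers mirror the MLPerf
# inference rules: Llama2-70B must reach at least 99.9% of the reference
# ROUGE scores and 90% of the reference tokens_per_sample.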
ACCURACY_TARGETS = {
    Model.Llama_v2_70B: {
        "rouge1": 44.4312 * 0.999,
        "rouge2": 22.0352 * 0.999,
        "rougeL": 28.6162 * 0.999,
        "tokens_per_sample": 294.45 * 0.9
    },
    Model.GPT_J: {
        "rouge1": 42.9435135,
        "rouge2": 20.1033765,
        "rougeL": 29.9581119,
        # "tokens_per_sample": ??
    }
}


def get_reference_df(processed_dataset_file):
    """Load reference outputs from the processed open-orca pickle."""
    data = pd.read_pickle(processed_dataset_file)
    return data["output"].tolist()


def get_reference_json(cnn_dailymail_valset):
    # Load reference outputs from the CNN-DailyMail validation set
    with open(cnn_dailymail_valset, 'r') as fh:
        list_data_dict = json.load(fh)

    targets = [f"{example['output']}" for example in list_data_dict]

    print(f"Loaded {len(targets)} samples from {cnn_dailymail_valset}")
    return targets


def get_responses_json(response_file):
    # Use a context manager so the file handle is closed after loading.
    with open(response_file) as f:
        responses = json.load(f)
    ordered_responses = sorted(responses, key=lambda x: int(x['response_id']))
    return ordered_responses


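# NOTE: get_responses_json expects the response file to be a JSON list of
# objects carrying at least the fields accessed above, e.g. (illustrative):
#   [{"response_id": 0, "response_tokens": [306, 626, ...]}, ...]
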
def postprocess_text(preds, targets):
    # Post-process output texts for ROUGE evaluation
    preds = [pred.strip() for pred in preds]
    targets = [target.strip() for target in targets]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]

    return preds, targets


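# Illustrative: with the sentence splitter, a prediction like
#   "The cat sat. It purred."
# becomes "The cat sat.\nIt purred." before being handed to ROUGE.
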
def strip_eos(pred_toks, eos_id):
    """Strip trailing EOS tokens from a list of token ids."""
    while len(pred_toks) > 0 and pred_toks[-1] == eos_id:
        pred_toks.pop()
    if len(pred_toks) == 0:
        raise RuntimeError("Output sequence contains only EOS tokens")
    return pred_toks


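# Illustrative behavior of strip_eos, assuming eos_id == 2:
#   strip_eos([5, 7, 2, 2], 2) -> [5, 7]
#   strip_eos([2, 2], 2)       -> RuntimeError
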
def calculate_toks_per_sample(preds, eos_id):
    """Return the average number of non-EOS tokens generated per sample."""
    preds = [strip_eos(pred, eos_id) for pred in preds]
    total_len = sum(len(pred) for pred in preds)
    num_samples = len(preds)
    return total_len / num_samples


def calculate_rouge_score(preds, targets, rouge_dir=None):
    print("Calculating ROUGE scores...")
    rouge_dir = rouge_dir if rouge_dir and os.path.exists(
        rouge_dir) else "rouge"
    metric = evaluate.load(rouge_dir)
    preds, targets = postprocess_text(preds, targets[:len(preds)])

    result = metric.compute(predictions=preds,
                            references=targets,
                            use_stemmer=True,
                            use_aggregator=False)
    result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()}
    prediction_lens = [len(pred) for pred in preds]
    result["gen_len"] = np.sum(prediction_lens)
    result["gen_num"] = len(preds)

    return result


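# With use_aggregator=False, metric.compute() returns a list of per-sample
# scores for each key (rouge1/rouge2/rougeL/rougeLsum); calculate_rouge_score
# reduces each list to its mean, scaled to 0-100.
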
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        type=str,
        help="Path to the reference dataset against which the responses are "
        "evaluated for accuracy. MLPerf uses open-orca (pkl) for Llama2-70B "
        "and cnn-dailymail (json) for GPT-J.")
    parser.add_argument(
        "--responses",
        type=str,
        help="Path to the JSON file holding the responses from the benchmark "
        "run.")
    parser.add_argument(
        "--base_model",
        type=str,
        help="Path or name of the base model (used to build the tokenizer).")
    parser.add_argument(
        '--rouge_dir',
        default=None,
        type=str,
        help="evaluate.load('rouge') will attempt to pull the rouge package "
        "from the Hugging Face hub. Pointing this at a cached copy avoids "
        "failures when the host or the hub is unreachable.")
    args = parser.parse_args()

    return args


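# NOTE: evaluate.load() also accepts a local path, so --rouge_dir can point
# at a cached copy of the `rouge` metric script to run fully offline.
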
def main():
    args = parse_arguments()

    if args.dataset.lower().endswith(".pkl"):
        target_texts = get_reference_df(args.dataset)
        model = Model.Llama_v2_70B
        tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
        relaxing_factor = 1.0
    elif args.dataset.lower().endswith(".json"):
        target_texts = get_reference_json(args.dataset)
        model = Model.GPT_J
        tokenizer = AutoTokenizer.from_pretrained(args.base_model,
                                                  model_max_length=2047,
                                                  padding_side="left",
                                                  use_fast=False)
        tokenizer.pad_token = tokenizer.eos_token
        relaxing_factor = 0.93
    else:
        raise RuntimeError(
            "Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")

    pred_out = get_responses_json(args.responses)
    pred_toks = [x['response_tokens'] for x in pred_out]

    # Compare token ids against the EOS token id, not the EOS string.
    tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token_id)
    pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
    achieved_scores = calculate_rouge_score(pred_texts, target_texts,
                                            args.rouge_dir)

    achieved_scores['tokens_per_sample'] = tps_score
    targets = ACCURACY_TARGETS[model]

    print("Achieved rouge scores: ", achieved_scores)
    print("Tokens per sample: ", tps_score)
    print("Targets: ", targets)

    for k, target in targets.items():
        assert achieved_scores[k] >= target * relaxing_factor, \
            f"{k}: achieved {achieved_scores[k]}, expected >= {target * relaxing_factor}"


if __name__ == "__main__":
main()
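# Example invocation (file names are illustrative):
#   python check_accuracy_mlperf.py \
#       --dataset open_orca_processed.pkl \
#       --responses benchmark_responses.json \
#       --base_model meta-llama/Llama-2-70b-hf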