# Mirrored from https://github.com/NVIDIA/TensorRT-LLM.git
import argparse
import json
import os
from enum import Enum

import evaluate
import nltk
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, LlamaTokenizerFast

# "punkt" supplies the sentence tokenizer that postprocess_text relies on.
nltk.download("punkt", quiet=False)


class Model(Enum):
    Llama_v2_70B = 1
    GPT_J = 2


# Reference accuracy targets: the Llama-v2-70B gates are 99.9% of the
# reference ROUGE scores and 90% of the reference tokens/sample.
ACCURACY_TARGETS = {
    Model.Llama_v2_70B: {
        "rouge1": 44.4312 * 0.999,
        "rouge2": 22.0352 * 0.999,
        "rougeL": 28.6162 * 0.999,
        "tokens_per_sample": 294.45 * 0.9
    },
    Model.GPT_J: {
        "rouge1": 42.9435135,
        "rouge2": 20.1033765,
        "rougeL": 29.9581119,
        # "tokens_per_sample": ??
    }
}
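# For example, the rouge1 gate for Llama-v2-70B works out to
# 44.4312 * 0.999 ≈ 44.3868, and the tokens_per_sample gate to
# 294.45 * 0.9 = 265.005, before the per-model relaxing factor
# applied in main().

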
def get_reference_df(processed_dataset_file):
    """Load reference outputs from a pickled (open-orca) dataframe."""
    data = pd.read_pickle(processed_dataset_file)
    return data["output"].tolist()


def get_reference_json(cnn_dailymail_valset):
    """Load reference outputs from the cnn-dailymail validation set (JSON)."""
    with open(cnn_dailymail_valset, 'r') as fh:
        list_data_dict = json.load(fh)

    targets = [f"{example['output']}" for example in list_data_dict]

    print(f"Loaded {len(targets)} samples from {cnn_dailymail_valset}")
    return targets


def get_responses_json(response_file):
    """Load benchmark responses and order them by response_id."""
    with open(response_file) as f:
        responses = json.load(f)
    ordered_responses = sorted(responses, key=lambda x: int(x['response_id']))
    return ordered_responses


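# A minimal sketch of the response-file layout this loader assumes, inferred
# from the field accesses above (the token values are illustrative only):
#
#   [
#       {"response_id": 1, "response_tokens": [1724, 526, 278, 2]},
#       {"response_id": 0, "response_tokens": [5159, 2683, 13, 2]}
#   ]
#
# Entries may arrive in any order; sorting on response_id restores the
# original sample order.

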
def postprocess_text(preds, targets):
    # Post-process output texts for ROUGE evaluation
    preds = [pred.strip() for pred in preds]
    targets = [target.strip() for target in targets]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    targets = ["\n".join(nltk.sent_tokenize(target)) for target in targets]

    return preds, targets


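# For example, a prediction like "It rained. We stayed in." becomes
# "It rained.\nWe stayed in.", the one-sentence-per-line layout that the
# rougeLsum variant scores against.

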
def strip_eos(pred_toks, eos_id):
    """Drop trailing EOS token ids from a single token sequence."""
    while len(pred_toks) > 0 and pred_toks[-1] == eos_id:
        pred_toks.pop()
    if len(pred_toks) == 0:
        raise RuntimeError("Empty output sequence detected with EOS")
    return pred_toks


def calculate_toks_per_sample(preds, eos_id):
    """Average number of generated tokens per sample, excluding trailing EOS."""
    preds = [strip_eos(pred, eos_id) for pred in preds]
    total_len = sum(len(pred) for pred in preds)
    num_samples = len(preds)
    return total_len / num_samples


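# For example, with eos_id = 2 the token lists [[5, 9, 2, 2], [7, 2]] strip
# down to [5, 9] and [7], giving (2 + 1) / 2 = 1.5 tokens per sample.

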
def calculate_rouge_score(preds, targets, rouge_dir=None):
    print("Calculating ROUGE scores...")
    # Fall back to pulling the "rouge" metric from the HF Hub unless a valid
    # local copy was supplied.
    rouge_dir = rouge_dir if rouge_dir and os.path.exists(
        rouge_dir) else "rouge"
    metric = evaluate.load(rouge_dir)
    preds, targets = postprocess_text(preds, targets[0:len(preds)])
    result = metric.compute(predictions=preds,
                            references=targets,
                            use_stemmer=True,
                            use_aggregator=False)
    result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()}
    prediction_lens = [len(pred) for pred in preds]
    result["gen_len"] = np.sum(prediction_lens)
    result["gen_num"] = len(preds)

    return result


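# A hypothetical direct call, to show the shape of the returned dict (the HF
# "rouge" metric reports rouge1/rouge2/rougeL/rougeLsum; gen_len and gen_num
# are appended above):
#
#   scores = calculate_rouge_score(["The cat sat."], ["The cat sat down."])
#   # -> {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...,
#   #     "gen_len": 12, "gen_num": 1}

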
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        type=str,
        help=
        "Path to the reference dataset against which the responses are evaluated for accuracy. MLPerf uses open-orca (pkl) for Llama2-70B and cnn-dailymail (json) for GPT-J."
    )
    parser.add_argument(
        "--responses",
        type=str,
        help="Path to the JSON file holding the responses from the benchmark run"
    )
    parser.add_argument("--base_model",
                        type=str,
                        help="Location of the base model, used to build the tokenizer")

    parser.add_argument(
        '--rouge_dir',
        default=None,
        type=str,
        help=
        "Path to a local copy of the rouge metric. By default, evaluate.load('rouge') pulls the package from the HF Hub; a cached copy avoids failures when the host has no network access."
    )

    args = parser.parse_args()

    return args


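# Example invocations (the script and file names are illustrative):
#
#   python evaluate.py --dataset open_orca_processed.pkl \
#       --responses responses.json --base_model /models/Llama-2-70b-hf
#
#   python evaluate.py --dataset cnn_dailymail_val.json \
#       --responses responses.json --base_model /models/gpt-j-6b \
#       --rouge_dir ./metrics/rouge

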
def main():
    args = parse_arguments()

    # The dataset extension selects the model, tokenizer, accuracy targets
    # and relaxing factor.
    if args.dataset.lower().endswith(".pkl"):
        target_texts = get_reference_df(args.dataset)
        model = Model.Llama_v2_70B
        tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
        relaxing_factor = 1.0
    elif args.dataset.lower().endswith(".json"):
        target_texts = get_reference_json(args.dataset)
        model = Model.GPT_J
        tokenizer = AutoTokenizer.from_pretrained(args.base_model,
                                                  model_max_length=2047,
                                                  padding_side="left",
                                                  use_fast=False)
        tokenizer.pad_token = tokenizer.eos_token
        relaxing_factor = 0.93
    else:
        raise RuntimeError(
            "Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")

    pred_out = get_responses_json(args.responses)
    pred_toks = [x['response_tokens'] for x in pred_out]

    # strip_eos compares token ids, so pass the EOS id rather than the EOS
    # string.
    tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token_id)

    pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
    achieved_scores = calculate_rouge_score(pred_texts, target_texts,
                                            args.rouge_dir)

    achieved_scores['tokens_per_sample'] = tps_score
    targets = ACCURACY_TARGETS[model]

    print("Achieved rouge scores: ", achieved_scores)
    print("Tokens per sample: ", tps_score)
    print("Targets: ", targets)

    for k in targets:
        assert targets[k] * relaxing_factor <= achieved_scores[k], \
            f"{k}: achieved {achieved_scores[k]}, expected at least " \
            f"{targets[k] * relaxing_factor}"


if __name__ == "__main__":
    main()