# MIT License
# Copyright (c) 2023 OpenBMB
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Evaluate a TensorRT-LLM engine on InfiniteBench long-context tasks
# (passkey, kv_retrieval).
# reference: https://github.com/OpenBMB/InfiniteBench/blob/main/src/eval_yarn_mistral.py

import argparse
import ast
import json
from pathlib import Path

import torch
from infinitebench.compute_scores import compute_scores
from infinitebench.eval_utils import (DATA_NAME_TO_MAX_NEW_TOKENS,
                                      create_prompt, dump_jsonl, get_answer,
                                      load_data)
from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES,
                   add_common_args, load_tokenizer, read_model_name)

import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm.logger import logger
from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner

# ModelRunnerCpp is only available when the C++ runtime bindings are built.
if PYTHON_BINDINGS:
    from tensorrt_llm.runtime import ModelRunnerCpp

MAX_POSITION_ID = 128 * 1024  # Determined by the model
TRUNCATE_LEN = 128 * 1024


def parse_arguments(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--max_input_length', type=int, default=923)
    parser.add_argument('--output_log_probs_npy',
                        type=str,
                        help='Numpy file where the log_probs are stored',
                        default=None)
    parser.add_argument('--output_cum_log_probs_npy',
                        type=str,
                        help='Numpy file where the cum_log_probs are stored',
                        default=None)
    parser.add_argument(
        "--task",
        type=str,
        choices=['passkey', 'kv_retrieval'],
        required=True,
        help=
        "Which task to use. Note that \"all\" can only be used in `compute_scores.py`.",  # noqa
    )
    parser.add_argument('--data_dir',
                        type=str,
                        default='./',
                        help="The directory of the data.")
    parser.add_argument("--output_dir",
                        type=str,
                        default=None,
                        help="Where to dump the prediction results.")  # noqa
    parser.add_argument(
        "--start_idx",
        type=int,
        default=0,
        help=
        "The index of the first example to infer on. This is used if you want to evaluate on a (contiguous) subset of the data."
    )  # noqa
    parser.add_argument(
        "--stop_idx",
        type=int,
        help=
        "The index of the last example to infer on. This is used if you want to evaluate on a (contiguous) subset of the data. Defaults to the length of the dataset."
    )  # noqa
    parser.add_argument('--tensorrt_llm_accuracy_threshold',
                        type=float,
                        default=99)
    parser = add_common_args(parser)

    return parser.parse_args(args=args)


def parse_input(tokenizer,
                input_text=None,
                prompt_template=None,
                add_special_tokens=True,
                max_input_length=923,
                pad_id=None,
                num_prepend_vtokens=[],
                model_name=None,
                model_version=None):
    """Tokenize a batch of prompts into int32 tensors of input ids."""
    if pad_id is None:
        pad_id = tokenizer.pad_token_id

    batch_input_ids = []
    for curr_text in input_text:
        if prompt_template is not None:
            curr_text = prompt_template.format(input_text=curr_text)
        input_ids = tokenizer.encode(curr_text,
                                     add_special_tokens=add_special_tokens,
                                     truncation=True,
                                     max_length=max_input_length)
        batch_input_ids.append(input_ids)

    if num_prepend_vtokens:
        # Prepend virtual tokens (ids beyond the base vocabulary) used by
        # prompt-tuning tables.
        assert len(num_prepend_vtokens) == len(batch_input_ids)
        base_vocab_size = tokenizer.vocab_size - len(
            tokenizer.special_tokens_map.get('additional_special_tokens', []))
        for i, length in enumerate(num_prepend_vtokens):
            batch_input_ids[i] = list(
                range(base_vocab_size,
                      base_vocab_size + length)) + batch_input_ids[i]

    if 'GLM' in model_name and model_version == 'glm':
        for ids in batch_input_ids:
            ids.append(tokenizer.sop_token_id)

    batch_input_ids = [
        torch.tensor(x, dtype=torch.int32) for x in batch_input_ids
    ]
    return batch_input_ids


def main(args):
    runtime_rank = tensorrt_llm.mpi_rank()
    logger.set_level(args.log_level)

    print(json.dumps(vars(args), indent=4))
    data_name = args.task

    # Model
    max_tokens = DATA_NAME_TO_MAX_NEW_TOKENS[data_name]

    model_name, model_version = read_model_name(args.engine_dir)
    if args.tokenizer_dir is None:
        logger.warning(
            "tokenizer_dir is not specified. Trying to infer it from model_name, but this may be incorrect."
        )
        args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name]

    tokenizer, pad_id, end_id = load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        vocab_file=args.vocab_file,
        model_name=model_name,
        model_version=model_version,
        tokenizer_type=args.tokenizer_type,
    )

    if not PYTHON_BINDINGS and not args.use_py_session:
        logger.warning(
            "Python bindings of the C++ session are unavailable, falling back to the Python session."
        )
        args.use_py_session = True
    if args.debug_mode and not args.use_py_session:
        logger.warning(
            "Debug mode is not supported in the C++ session for now, falling back to the Python session."
        )
        args.use_py_session = True
    runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp
    runner_kwargs = dict(
        engine_dir=args.engine_dir,
        lora_dir=args.lora_dir,
        rank=runtime_rank,
        debug_mode=args.debug_mode,
        lora_ckpt_source=args.lora_ckpt_source,
        gpu_weights_percent=args.gpu_weights_percent,
    )
    if args.medusa_choices is not None:
        args.medusa_choices = ast.literal_eval(args.medusa_choices)
        assert args.temperature == 1.0, "Medusa should use temperature == 1.0"
        assert args.num_beams == 1, "Medusa should use num_beams == 1"
        runner_kwargs.update(medusa_choices=args.medusa_choices)
    if not args.use_py_session:
        runner_kwargs.update(
            max_batch_size=args.batch_size,
            max_input_len=args.max_input_length,
            max_output_len=max_tokens,
            max_beam_width=args.num_beams,
            max_attention_window_size=args.max_attention_window_size,
            sink_token_length=args.sink_token_length,
            max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache,
            kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse,
            kv_cache_free_gpu_memory_fraction=args.
            kv_cache_free_gpu_memory_fraction,
            enable_chunked_context=args.enable_chunked_context,
        )
    runner = runner_cls.from_dir(**runner_kwargs)

    # Data
    examples = load_data(data_name, data_dir=args.data_dir)

    output_path = None
    if runtime_rank == 0:
        if args.output_dir is not None:
            result_dir = Path(args.output_dir, model_name)
            result_dir.mkdir(exist_ok=True, parents=True)
            if args.stop_idx is None:
                output_path = result_dir / f"preds_{data_name}.jsonl"
            else:
                output_path = (
                    result_dir /
                    f"preds_{data_name}_{args.start_idx}-{args.stop_idx}.jsonl"  # noqa
                )

    if args.stop_idx is None:
        args.stop_idx = len(examples)

    prompt_template = None
    if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES:
        prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name]

    if runtime_rank == 0:
        preds = []
        logger.info("==== Evaluation ====")
        logger.info(f"# examples: {len(examples)}")
        logger.info(f"Start index: {args.start_idx}")
        logger.info(f"Stop index: {args.stop_idx}")
        logger.info(f"Max tokens: {max_tokens}")
    assert args.batch_size == 1
    profiler.start('Evaluation')
    for i in range(args.start_idx, args.stop_idx):
        eg = examples[i]
        input_text = [create_prompt(eg, data_name, args.data_dir)]
        batch_input_ids = parse_input(
            tokenizer=tokenizer,
            input_text=input_text,
            prompt_template=prompt_template,
            add_special_tokens=args.add_special_tokens,
            max_input_length=args.max_input_length,
            pad_id=pad_id,
            num_prepend_vtokens=args.num_prepend_vtokens,
            model_name=model_name,
            model_version=model_version)
        input_lengths = [x.size(0) for x in batch_input_ids]

        if runtime_rank == 0:
            logger.debug(f"====== Example {i} ======")
            logger.debug(f"input_lengths: {input_lengths}")
            logger.debug(f"input_text: {input_text}")
            logger.debug(f"answer: {get_answer(eg, data_name)}")
        outputs = runner.generate(
            batch_input_ids,
            max_new_tokens=max_tokens,
            max_attention_window_size=args.max_attention_window_size,
            sink_token_length=args.sink_token_length,
            end_id=end_id,
            pad_id=pad_id,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            num_beams=args.num_beams,
            length_penalty=args.length_penalty,
            early_stopping=args.early_stopping,
            beam_width_array=args.beam_width_array,
            repetition_penalty=args.repetition_penalty,
            presence_penalty=args.presence_penalty,
            frequency_penalty=args.frequency_penalty,
            prompt_ignore_length=args.prompt_ignore_length,
            # stop_words_list=stop_words_list,
            # bad_words_list=bad_words_list,
            output_cum_log_probs=(args.output_cum_log_probs_npy is not None),
            output_log_probs=(args.output_log_probs_npy is not None),
            lora_uids=args.lora_task_uids,
            prompt_table=args.prompt_table_path,
            prompt_tasks=args.prompt_tasks,
            streaming=args.streaming,
            output_sequence_lengths=True,
            return_dict=True,
            medusa_choices=args.medusa_choices)
        torch.cuda.synchronize()

        if runtime_rank == 0:
            output_ids = outputs['output_ids']
            # Decode only the generated continuation, i.e. the tokens after
            # the prompt for each sequence in the batch.
            output_beams_list = [
                tokenizer.batch_decode(output_ids[batch_idx, :,
                                                  input_lengths[batch_idx]:],
                                       skip_special_tokens=True)
                for batch_idx in range(args.batch_size)
            ]
            logger.debug(f"preds: {output_beams_list[0]}")

            preds.append({
                "id": i,
                "prediction": output_beams_list[0][0],
                "ground_truth": get_answer(eg, data_name),
                "input_lengths": input_lengths,
            })
            if output_path is not None:
                dump_jsonl(preds, output_path)
    profiler.stop('Evaluation')

    if runtime_rank == 0:
        logger.info(
            f'Evaluation takes: {profiler.elapsed_time_in_sec("Evaluation")} sec.'
        )
        logger.info("Compute the score")
        acc = compute_scores(preds, args.task) * 100
        logger.info(f"{args.task} accuracy: {acc:.2f} ({len(preds)})")
        if args.tensorrt_llm_accuracy_threshold is not None:
            assert acc >= args.tensorrt_llm_accuracy_threshold, (
                f"acc ({acc}) < tensorrt_llm_accuracy_threshold "
                f"({args.tensorrt_llm_accuracy_threshold})")


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
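
# Illustrative usage (a sketch, not part of the original script): the command
# below assumes this file is saved as eval_long_context.py and that
# add_common_args registers the usual TensorRT-LLM example options referenced
# above (--engine_dir, --tokenizer_dir, --log_level, ...); the engine, model,
# and data paths are hypothetical placeholders.
#
#   python eval_long_context.py \
#       --task passkey \
#       --engine_dir ./engines/llama_128k \
#       --tokenizer_dir ./models/llama_hf \
#       --data_dir ./infinitebench_data \
#       --output_dir ./preds \
#       --max_input_length 131072 \
#       --batch_size 1
#
# The measured accuracy is asserted against --tensorrt_llm_accuracy_threshold
# (default 99), so a run scoring below the threshold exits with an
# AssertionError.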