# TensorRT-LLM/examples/bert/run.py
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
# isort: off
import torch
import tensorrt as trt
# isort: on
from transformers import AutoConfig, AutoTokenizer
from utils import (compare_bertcls_result, compare_bertqa_result,
                    decode_bertcls_output, decode_bertqa_output, get_engine_name,
                    intermediate_check, prepare_text_inputs, process_input,
                    temporary_datasets_config)
import tensorrt_llm
from tensorrt_llm import logger
from tensorrt_llm._utils import trt_dtype_to_torch
from tensorrt_llm.runtime import Session, TensorInfo
from transformers import BertConfig, BertPreTrainedModel, BertForQuestionAnswering, BertForSequenceClassification, BertModel # isort:skip
from transformers import RobertaConfig, RobertaPreTrainedModel, RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaModel # isort:skip
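
# Map each supported architecture to the name of the engine's output tensor;
# the runtime looks this name up in the session outputs after inference.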
OUTPUT_NAME_MAPPING = {
    'BertModel': 'hidden_states',
    'BertForQuestionAnswering': 'logits',
    'BertForSequenceClassification': 'logits',
    'RobertaModel': 'hidden_states',
    'RobertaForQuestionAnswering': 'logits',
    'RobertaForSequenceClassification': 'logits'
}


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_level', type=str, default='info')
    parser.add_argument('--engine_dir', type=str, default='bert_outputs')
    parser.add_argument('--hf_model_dir', type=str, required=True)
    parser.add_argument('--run_hf_test', action='store_true')
    parser.add_argument('--remove_input_padding', action='store_true')
    parser.add_argument('--debug', action='store_true')
    return parser.parse_args()
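
# Example invocation (the HF model path is a placeholder; the engine must have
# been built with the same --remove_input_padding setting used here):
#   python run.py --engine_dir bert_outputs --hf_model_dir <hf_bert_model_dir> --run_hf_test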

if __name__ == '__main__':
    args = parse_arguments()
    tensorrt_llm.logger.set_level(args.log_level)

    config_path = os.path.join(args.engine_dir, 'config.json')
    with open(config_path, 'r') as f:
        config = json.load(f)
    remove_padding = config['build_config']['plugin_config'][
        'remove_input_padding']
    assert args.remove_input_padding == remove_padding, \
        f"The engine was built with remove_input_padding={remove_padding}, " \
        f"but inference was requested with remove_input_padding={args.remove_input_padding}!"
    world_size = config['pretrained_config']['mapping']['world_size']
    assert world_size == tensorrt_llm.mpi_world_size(), \
        f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'

    model_name = config['pretrained_config']['architecture']
    # Roberta doesn't use token_type_ids; all-zero tensors are substituted.
    is_roberta = "Roberta" in model_name
    runtime_rank = tensorrt_llm.mpi_rank() if world_size > 1 else 0
    runtime_mapping = tensorrt_llm.Mapping(world_size,
                                           runtime_rank,
                                           tp_size=world_size)
    torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)

    serialize_path = get_engine_name(runtime_rank)
    serialize_path = os.path.join(args.engine_dir, serialize_path)
    stream = torch.cuda.current_stream().cuda_stream

    logger.info(f'Loading engine from {serialize_path}')
    with open(serialize_path, 'rb') as f:
        engine_buffer = f.read()
    logger.info('Creating session from engine')
    session = Session.from_serialized_engine(engine_buffer)
    if args.debug:
        session._print_engine_info()

    # NOTE: prepare input
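    # With remove_input_padding, the tokenized sequences are packed (no padding)
    # and handed to the engine together with per-sequence lengths; otherwise all
    # sequences are padded to the longest one in the batch and an attention mask
    # marks the real tokens.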
    with temporary_datasets_config(HF_DATASETS_OFFLINE=False):
        test_inputs = prepare_text_inputs(model_name)
    hf_tokenizer = AutoTokenizer.from_pretrained(args.hf_model_dir)

    if args.remove_input_padding:
        # NOTE: remove padding
        inputs_without_padding = hf_tokenizer(**test_inputs)
        input_ids_list = [
            torch.tensor(ids).int().cuda()
            for ids in inputs_without_padding['input_ids']
        ]
        # attention_mask_list = inputs_without_padding['attention_mask'],
        if is_roberta:
            token_type_ids_list = [
                torch.zeros_like(torch.tensor(ids)).int().cuda()
                for ids in inputs_without_padding['input_ids']
            ]
        else:
            token_type_ids_list = [
                torch.tensor(ids).int().cuda()
                for ids in inputs_without_padding['token_type_ids']
            ]
        input_ids, input_lengths, token_type_ids, position_ids, max_input_length = \
            process_input(input_ids_list=input_ids_list,
                          token_type_ids_list=token_type_ids_list,
                          is_roberta=is_roberta,
                          padding_idx=config['pretrained_config']['pad_token_id'])
    else:
        # NOTE: Padding: pad to the longest seq len in the batch
        inputs_with_padding = hf_tokenizer(
            **test_inputs,
            padding=True,
        )
        inputs_without_padding = hf_tokenizer(**test_inputs)
        input_ids = torch.tensor(inputs_with_padding['input_ids']).int().cuda()
        input_lengths = [len(x) for x in inputs_without_padding['input_ids']]
        input_lengths = torch.tensor(input_lengths,
                                     device=input_ids.device,
                                     dtype=torch.int32)
        attention_mask = torch.tensor(inputs_with_padding['attention_mask'],
                                      device=input_ids.device,
                                      dtype=torch.int32)
        if is_roberta:
            token_type_ids = torch.zeros_like(torch.tensor(
                inputs_with_padding['input_ids']),
                                              device=input_ids.device,
                                              dtype=torch.int32)
        else:
            token_type_ids = torch.tensor(
                inputs_with_padding['token_type_ids'],
                device=input_ids.device,
                dtype=torch.int32)

    # NOTE: TRT-LLM performs inference
    output_name = OUTPUT_NAME_MAPPING[model_name]
    if args.remove_input_padding:
        # NOTE: remove padding
        inputs = {
            "input_ids": input_ids,
            "input_lengths": input_lengths,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
            "max_input_length": max_input_length
        }
        output_info = session.infer_shapes([
            TensorInfo("input_ids", trt.DataType.INT32, input_ids.shape),
            TensorInfo("input_lengths", trt.DataType.INT32,
                       input_lengths.shape),
            TensorInfo("token_type_ids", trt.DataType.INT32,
                       token_type_ids.shape),
            TensorInfo("position_ids", trt.DataType.INT32, position_ids.shape),
            TensorInfo("max_input_length", trt.DataType.INT32,
                       max_input_length.shape)
        ])
    else:
        # NOTE: padding
        inputs = {
            'input_ids': input_ids,
            'input_lengths': input_lengths,
            'token_type_ids': token_type_ids,
        }
        output_info = session.infer_shapes([
            TensorInfo('input_ids', trt.DataType.INT32, input_ids.shape),
            TensorInfo('input_lengths', trt.DataType.INT32,
                       input_lengths.shape),
            TensorInfo('token_type_ids', trt.DataType.INT32,
                       token_type_ids.shape)
        ])
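
    # infer_shapes() returned the output shapes matching these inputs, so the
    # output buffers can be pre-allocated on the GPU before running the engine.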
    outputs = {
        t.name: torch.empty(tuple(t.shape),
                            dtype=trt_dtype_to_torch(t.dtype),
                            device='cuda')
        for t in output_info
    }
    assert output_name in outputs, \
        f'{output_name} not found in outputs, check if build.py set output name correctly'
    logger.info(f"Rank{runtime_rank} is running inference...")
    ok = session.run(inputs=inputs, outputs=outputs, stream=stream)
    assert ok, "Runtime execution failed"
    torch.cuda.synchronize()
    res = outputs[output_name]
    if args.debug:
        logger.info(f"Outputs: {outputs.keys()}")

    # NOTE: load HF model and perform inference as reference (only on rank0)
    if tensorrt_llm.mpi_rank() == 0:
        logger.info(f"Rank{runtime_rank} is generating HF reference...")
        if args.run_hf_test:
            hf_bert = globals()[f'{model_name}'].from_pretrained(
                args.hf_model_dir).cuda().to(torch.float16).eval()
            hf_inputs = hf_tokenizer(**test_inputs,
                                     padding=True,
                                     return_tensors="pt")
            hf_inputs = hf_inputs.to(hf_bert.device)
            with torch.no_grad():
                hf_outputs = hf_bert.forward(output_hidden_states=args.debug,
                                             **hf_inputs)
            torch.cuda.synchronize()

    # NOTE: Decode output (only on rank0)
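    # Each architecture exposes a different output: BertModel/RobertaModel return
    # per-token hidden states, the *ForQuestionAnswering variants return stacked
    # start/end logits, and the *ForSequenceClassification variants return
    # per-class logits, so each branch below decodes and compares differently.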
    if tensorrt_llm.mpi_rank() == 0:
        logger.info(f"Rank{runtime_rank} is comparing with HF reference...")
        if model_name == "BertModel" or model_name == "RobertaModel":
            if args.remove_input_padding:
                # Reshape the packed result back to [batch_size, ...] and pad it
                # so that it can be compared against the padded reference tensor.
                from torch.nn.utils.rnn import pad_sequence
                res = torch.split(res, input_lengths.tolist(), dim=0)
                res = pad_sequence(list(res), batch_first=True, padding_value=0)
            else:
                # Apply the attention mask to the TRT-LLM result so that padded
                # positions do not contribute to the comparison.
                attention_mask_tmp = attention_mask.unsqueeze(-1)
                res = res * attention_mask_tmp
            if args.run_hf_test:
                ref = hf_outputs.last_hidden_state
                ref = ref * hf_inputs['attention_mask'].unsqueeze(-1)
                if args.debug:
                    intermediate_check(outputs, hf_outputs['hidden_states'],
                                       attention_mask_tmp, logger)
                if world_size == 1:
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=1.5e-2,
                                               atol=1.5e-2)
                else:
                    # With TP > 1 the arithmetic order differs from the HF
                    # reference, which always runs with TP = 1, so use looser
                    # tolerances.
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=4e-2,
                                               atol=2e-2)
                print(f"{model_name} result is all close to HF reference!")
        if model_name == 'BertForQuestionAnswering' or model_name == 'RobertaForQuestionAnswering':
            if args.remove_input_padding:
                # [num_tokens, 2] -> 2 x [num_tokens, 1]
                res_start_logits, res_end_logits = torch.split(res, 1, -1)
                # Reshape the packed results back to [batch_size, ...]
                res_start_logits = torch.split(res_start_logits,
                                               input_lengths.tolist(),
                                               dim=0)
                res_start_logits = tuple(t.squeeze() for t in res_start_logits)
                res_end_logits = torch.split(res_end_logits,
                                             input_lengths.tolist(),
                                             dim=0)
                res_end_logits = tuple(t.squeeze() for t in res_end_logits)
            else:
                # NOTE: padding
                # [B, padded_len, 2] -> 2 x [B, padded_len, 1]
                res_start_logits, res_end_logits = torch.split(res, 1, -1)
                # [B, padded_len, 1] -> [B, padded_len]
                res_start_logits = res_start_logits.squeeze()
                res_end_logits = res_end_logits.squeeze()
                res_start_logits = res_start_logits * attention_mask
                res_end_logits = res_end_logits * attention_mask
                res_start_logits = torch.split(res_start_logits, 1, dim=0)
                res_start_logits = tuple(t.squeeze(0) for t in res_start_logits)
                res_end_logits = torch.split(res_end_logits, 1, dim=0)
                res_end_logits = tuple(t.squeeze(0) for t in res_end_logits)

            decode_res = decode_bertqa_output(inputs_text=test_inputs,
                                              hf_tokenizer=hf_tokenizer,
                                              start_logits=res_start_logits,
                                              end_logits=res_end_logits)
            if args.run_hf_test:
                ref_start_logits = hf_outputs.start_logits
                ref_end_logits = hf_outputs.end_logits
                # With the plugin and a real model_dir and inputs, the outputs at
                # the padding positions do not matter, so mask them out.
                ref_start_logits = ref_start_logits * hf_inputs['attention_mask']
                ref_end_logits = ref_end_logits * hf_inputs['attention_mask']
                decode_ref = decode_bertqa_output(inputs_text=test_inputs,
                                                  hf_tokenizer=hf_tokenizer,
                                                  start_logits=ref_start_logits,
                                                  end_logits=ref_end_logits)
                compare_bertqa_result(inputs_text=test_inputs,
                                      res_answers=decode_res,
                                      ref_answers=decode_ref)
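
        # The classification heads output per-class logits; decode_bertcls_output
        # maps them to predictions using the HF model config, and the predictions
        # are compared in addition to the numeric tolerance check below.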
        elif model_name == 'BertForSequenceClassification' or model_name == 'RobertaForSequenceClassification':
            hf_config = AutoConfig.from_pretrained(args.hf_model_dir)
            decode_res = decode_bertcls_output(logits=res,
                                               hf_model_config=hf_config,
                                               inputs_text=test_inputs)
            if args.run_hf_test:
                ref = hf_outputs.logits
                if world_size == 1:
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=1.5e-2,
                                               atol=1.5e-2)
                else:
                    # With TP > 1 the arithmetic order differs from the HF
                    # reference, which always runs with TP = 1, so use looser
                    # tolerances.
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=4e-2,
                                               atol=2e-2)
                decode_ref = decode_bertcls_output(logits=hf_outputs.logits,
                                                   hf_model_config=hf_config,
                                                   inputs_text=test_inputs)
                compare_bertcls_result(inputs_text=test_inputs,
                                       res_answers=decode_res,
                                       ref_answers=decode_ref)