TensorRT-LLMs/cpp/tests/resources/scripts/generate_expected_chatglm_output.py

#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import numpy as np
import run
import torch


def generate_output(
    model_name: str = "",
    engine_kind: str = "fp16-plugin",
    num_batchs: int = 1,
    num_beams: int = 1,
    max_output_len: int = 512,
    output_logits: bool = False,
):

    examples_chatglm_dir = Path(
        __file__).parent.parent.parent.parent.parent / "examples/chatglm"
    resources_dir = Path(__file__).parent.parent.resolve()

    engine_dir = resources_dir / 'models' / 'rt_engine' / model_name
    '''
    # we do not distinguish TP / PP / engine_kind yet
    tp_size = 1
    pp_size = 1
    tp_pp_dir = 'tp' + str(tp_size) + '-pp' + str(pp_size) + '-gpu/'
    engine_dir = engine_dir / engine_kind / tp_pp_dir
    '''
    data_output_dir = resources_dir / 'data' / model_name
    data_output_dir.mkdir(exist_ok=True, parents=True)
    data_input_file_name = f"inputId-BS{num_batchs}-BM{num_beams}.npy"
    data_output_file_name = f"outputId-BS{num_batchs}-BM{num_beams}.npy"
    input_text = [
        "Born in north-east France, Soyer trained as a",
        "Jen-Hsun Huang was born in Tainan, Taiwan, in 1963. His family",
    ]

    if num_batchs <= 2:
        input_text = input_text[:num_batchs]
    else:
        input_text = input_text + input_text[-1] * (num_batchs - 2)

    args = run.parse_arguments([
        '--engine_dir',
        str(engine_dir),
        '--tokenizer_dir',
        str(examples_chatglm_dir / model_name),
        '--input_text',
        *input_text,
        '--output_npy',
        str(data_output_dir / data_output_file_name),
        '--max_output_len',
        str(max_output_len),
        '--num_beams',
        str(num_beams),
    ])

    # Since main in run.py does not save input_ids, we save it manually
    model_name, model_version = run.read_model_name(args.engine_dir)
    tokenizer, pad_id, end_id = run.load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        model_name=model_name,
        model_version=model_version,
    )
    batch_input_ids = run.parse_input(
        tokenizer,
        input_text=input_text,
        prompt_template=None,
        input_file=None,
        add_special_tokens=True,
        max_input_length=512,
        pad_id=pad_id,
        num_prepend_vtokens=[],
        model_name=model_name,
        model_version=model_version,
    )
    input_len = [x.size(0) for x in batch_input_ids]
    max_input_len = max(input_len)

    batch_input_ids_padding = torch.zeros([num_batchs, max_input_len],
                                          dtype=torch.int32) + end_id

    for i, sample in enumerate(batch_input_ids):
        # padding to left
        batch_input_ids_padding[i, :len(sample)] = sample
        """
        # padding to right
        nPadding = 0
        for token in sample:
            if token == pad_id:
                nPadding += 1
            else:
                break
        batch_input_ids_padding[i, :len(sample[nPadding:])] = sample[nPadding:]
        """
    batch_input_ids = batch_input_ids_padding

    np.save(data_output_dir / data_input_file_name,
            batch_input_ids.detach().cpu().numpy())

    run.main(args)

    output_data = np.load(args.output_npy)
    np.save(args.output_npy, output_data.reshape(num_batchs, num_beams, -1))


if __name__ == '__main__':

    generate_output(model_name='chatglm_6b', num_batchs=1, num_beams=1)

    generate_output(model_name='chatglm2_6b', num_batchs=1, num_beams=1)
    generate_output(model_name='chatglm2_6b', num_batchs=2, num_beams=1)
    generate_output(model_name='chatglm2_6b', num_batchs=1, num_beams=2)

    generate_output(model_name='chatglm3_6b', num_batchs=1, num_beams=1)
    generate_output(model_name='chatglm3_6b', num_batchs=2, num_beams=1)
    generate_output(model_name='chatglm3_6b', num_batchs=1, num_beams=2)

    print("Done")