# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import tempfile
from pathlib import Path

import pytest
from defs.common import convert_weights, venv_check_call
from defs.conftest import llm_models_root, unittest_path
from defs.trt_test_alternative import check_call

from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import BuildConfig


def test_llmapi_quant_llama_70b(llm_root, engine_dir, llm_venv):
    # Test quantizing the llama-70b model with only 2 H100 GPUs.
    # Background: there is a bug that prevents quantizing llama-70b with fewer
    # GPUs than the TP size, so this test pins CUDA_VISIBLE_DEVICES to two devices.
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "").split(',')
    if len(visible_devices) < 2:
        visible_devices = ['0', '1']
    visible_devices = visible_devices[:2]

    env = {
        'CUDA_VISIBLE_DEVICES': ','.join(visible_devices),
    }
    print(f'env: {env}')

    script_path = Path(
        llm_root
    ) / "tests/integration/defs/examples/run_llm_fp8_quant_llama_70b.py"
    llm_venv.run_cmd([str(script_path)], env=env)
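

# Helper script shared by the tests below: it loads the given --model_dir
# (an engine directory or a TensorRT-LLM checkpoint) through the LLM API.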
run_llm_path = os.path.join(os.path.dirname(__file__), "_run_llmapi_llm.py")


@pytest.mark.parametrize("model_name,model_path", [
    ("llama", "llama-models-v2/llama-v2-7b-hf"),
])
def test_llmapi_load_engine_from_build_command_with_lora(
        llm_root, llm_venv, engine_dir, model_name, model_path):
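    """Build a LoRA-enabled engine with trtllm-build and load it via the LLM API."""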
    llama_example_root = os.path.join(llm_root, "examples", "models", "core",
                                      model_name)
    dtype = 'bfloat16'
    cmodel_dir = os.path.join(engine_dir, f"{model_name}-engine")
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=llama_example_root,
                               cmodel_dir=cmodel_dir,
                               model=model_name,
                               model_path=f'{llm_models_root()}/{model_path}',
                               data_type=dtype)

    engine_dir = os.path.join(engine_dir, f"{model_name}-engine")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={1}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--max_beam_width={1}",
        f"--gemm_plugin={dtype}",
        f"--gpt_attention_plugin={dtype}",
        f"--lora_plugin={dtype}",
        f"--lora_target_modules=attn_q",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    venv_check_call(llm_venv, [
        run_llm_path,
        "--model_dir",
        engine_dir,
    ])


@pytest.mark.skip(reason="https://nvbugs/5574355")
@pytest.mark.parametrize("model_name,model_path", [
    ("llama", "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"),
])
def test_llmapi_build_command_parameters_align(llm_root, llm_venv, engine_dir,
                                               model_name, model_path):
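    """Check that an engine built with trtllm-build and one built through
    LLM(build_config=...) serialize the same build_config section."""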
    llama_example_root = os.path.join(llm_root, "examples", model_name)
    dtype = 'float16'
    cmodel_dir = os.path.join(engine_dir, f"{model_name}-engine")
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=llama_example_root,
                               cmodel_dir=cmodel_dir,
                               model=model_name,
                               model_path=f'{llm_models_root()}/{model_path}',
                               data_type=dtype)

    engine_dir = os.path.join(engine_dir, f"{model_name}-engine")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={4}",
        f"--max_input_len={111}",
        f"--max_seq_len={312}",
        f"--max_beam_width={4}",
        f"--gemm_plugin={dtype}",
        f"--gpt_attention_plugin={dtype}",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    # Mirror the trtllm-build flags above in a BuildConfig so both engines
    # should end up with an identical build_config.
    build_config = BuildConfig()
    build_config.max_batch_size = 4
    build_config.max_beam_width = 4
    build_config.max_input_len = 111
    build_config.strongly_typed = True
    build_config.max_seq_len = 312
    build_config.plugin_config._gemm_plugin = dtype
    build_config.plugin_config._gpt_attention_plugin = dtype

    llm = LLM(model=f'{llm_models_root()}/{model_path}',
              build_config=build_config)
    tmpdir = tempfile.TemporaryDirectory()
    llm.save(tmpdir.name)

    build_cmd_cfg = None
    build_llmapi_cfg = None

    with open(os.path.join(engine_dir, "config.json"), "r") as f:
        engine_config = json.load(f)
        build_cmd_cfg = BuildConfig(
            **engine_config["build_config"]).model_dump()

    with open(os.path.join(tmpdir.name, "config.json"), "r") as f:
        llm_api_engine_cfg = json.load(f)
        build_llmapi_cfg = BuildConfig(
            **llm_api_engine_cfg["build_config"]).model_dump()

    assert build_cmd_cfg == build_llmapi_cfg


def test_llmapi_load_ckpt_from_convert_command(llm_root, llm_venv, engine_dir):
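    """Load a TensorRT-LLM checkpoint produced by convert_weights directly via the LLM API."""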
    llama_example_root = os.path.join(llm_root, "examples", "models", "core",
                                      "llama")
    dtype = 'float16'
    cmodel_dir = os.path.join(engine_dir, "llama-7b-cmodel")
    ckpt_dir = convert_weights(
        llm_venv=llm_venv,
        example_root=llama_example_root,
        cmodel_dir=cmodel_dir,
        model='llama-7b',
        model_path=f'{llm_models_root()}/llama-models/llama-7b-hf',
        data_type=dtype)

    venv_check_call(llm_venv, [
        run_llm_path,
        "--model_dir",
        ckpt_dir,
    ])


def test_llmapi_exit(llm_venv):
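    """Exercise LLM shutdown via the run_llm_exit.py script with tp_size=1."""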
    llm_exit_script = unittest_path() / "llmapi/run_llm_exit.py"
    llama_model_dir = Path(
        llm_models_root()) / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"

    run_command = [
        str(llm_exit_script), "--model_dir",
        str(llama_model_dir), "--tp_size", "1"
    ]
    venv_check_call(llm_venv, run_command)


@pytest.mark.skip_less_device(2)
def test_llmapi_exit_multi_gpu(llm_venv):
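    """Exercise LLM shutdown via the run_llm_exit.py script with tp_size=2."""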
    llm_exit_script = unittest_path() / "llmapi/run_llm_exit.py"
    llama_model_dir = Path(
        llm_models_root()) / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"

    run_command = [
        str(llm_exit_script), "--model_dir",
        str(llama_model_dir), "--tp_size", "2"
    ]
    venv_check_call(llm_venv, run_command)


@pytest.mark.parametrize("model_name,model_path", [
    ("llama", "llama-models/llama-7b-hf"),
    ("llama", "codellama/CodeLlama-7b-Instruct-hf"),
])
def test_llmapi_load_engine_from_build_command(llm_root, llm_venv, engine_dir,
                                               model_name, model_path):
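    """Build an engine with trtllm-build and load it via the LLM API."""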
    llama_example_root = os.path.join(llm_root, "examples", "models", "core",
                                      model_name)
    dtype = 'float16'
    cmodel_dir = os.path.join(engine_dir, f"{model_name}-engine")
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=llama_example_root,
                               cmodel_dir=cmodel_dir,
                               model=model_name,
                               model_path=f'{llm_models_root()}/{model_path}',
                               data_type=dtype)

    engine_dir = os.path.join(engine_dir, f"{model_name}-engine")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--max_beam_width={1}",
        f"--gemm_plugin={dtype}",
        f"--gpt_attention_plugin={dtype}",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    venv_check_call(llm_venv, [
        run_llm_path,
        "--model_dir",
        engine_dir,
    ])