Mirror of https://github.com/NVIDIA/TensorRT-LLM.git · synced 2026-01-14 06:27:45 +08:00
133 lines · 4.5 KiB · Python
import os
import sys
from difflib import SequenceMatcher

import pytest
import torch

from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi.llm_utils import CalibConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig

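# Make the shared test helpers under utils/ importable.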
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from utils.llm_data import llm_models_root
from utils.util import getSMVersion

MAX_SEQ_LEN = 2048


def similar(a, b, threshold=0.9):
    """Return True when a and b are at least `threshold` similar."""
    return SequenceMatcher(None, a, b).ratio() >= threshold


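# The test sweeps quantization mode (bf16, fp8, fp8 weights + fp8 KV cache),
# tensor-parallel size (1 or 4 GPUs) and torch.compile vs. eager execution.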
@pytest.mark.parametrize("model_name", ["llama-3.1-model/Meta-Llama-3.1-8B"],
                         ids=["llama-3.1-8b"])
@pytest.mark.parametrize("quant", ["bf16", "fp8", "fp8_kv_cache"])
@pytest.mark.parametrize("tp_size", [1, 4], ids=["tp1", "tp4"])
@pytest.mark.parametrize("torch_compile", [True, False],
                         ids=["torch_compile", "eager"])
def test_model(model_name, quant, tp_size, torch_compile):
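    # Map each quantization mode to its QuantConfig; "bf16" means no
    # quantization.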
    quant_configs = {
        "bf16":
        QuantConfig(),
        "fp8":
        QuantConfig(quant_algo=QuantAlgo.FP8),
        "fp8_kv_cache":
        QuantConfig(
            quant_algo=QuantAlgo.FP8,
            kv_cache_quant_algo=QuantAlgo.FP8,
        ),
    }
    quant_config = quant_configs[quant]
    is_fp8 = quant_config.quant_algo == QuantAlgo.FP8
    is_fp8_kv_cache = quant_config.kv_cache_quant_algo == QuantAlgo.FP8
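    # Skip combinations this machine cannot run: too few GPUs, no FP8 support
    # (compute capability below SM 90), or not enough GPU memory.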
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs available, need {tp_size} "
                    f"but only have {torch.cuda.device_count()}")
    if is_fp8 and getSMVersion() < 90:
        pytest.skip(f"FP8 is not supported on SM version {getSMVersion()}")
    # 8GB weights + 8GB KV cache + 8GB cache_indirection (TRT engine only) = 24GB
    if is_fp8 and get_total_gpu_memory(0) < 24 * 1024**3:
        pytest.skip("Not enough GPU memory to run the FP8 model")
    # 16GB weights + 8GB KV cache + 8GB cache_indirection (TRT engine only) = 32GB
    if not is_fp8 and get_total_gpu_memory(0) < 32 * 1024**3:
        pytest.skip("Not enough GPU memory to run the BF16 model")

    prompts = [
        "The president of the United States is",
    ]

    expected_outputs = [
        " the head of state and head of government of the",
    ]

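    # PyTorch backend settings: torch.compile and CUDA-graph padding follow the
    # `torch_compile` parameter; CUDA graph batch sizes are limited to 4.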
    pytorch_config = PyTorchConfig(
        torch_compile_enabled=torch_compile,
        cuda_graph_padding_enabled=torch_compile,
        cuda_graph_batch_sizes=[4],
    )
    if is_fp8_kv_cache:
        pytorch_config.kv_cache_dtype = "fp8"

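    # FP8 runs load a pre-quantized FP8 checkpoint instead of the BF16 weights.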
    model_dir = str(llm_models_root() / model_name)
    if is_fp8:
        fp8_model_names = {
            "llama-3.1-model/Meta-Llama-3.1-8B":
            "llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
        }
        model_dir = str(llm_models_root() / fp8_model_names[model_name])

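    # Build the LLM with the selected parallelism, quantization, calibration
    # dataset and backend settings, then generate up to 10 tokens per prompt.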
    llm = LLM(
        model=model_dir,
        tensor_parallel_size=tp_size,
        quant_config=quant_config,
        pytorch_backend_config=pytorch_config,
        calib_config=CalibConfig(calib_dataset=str(llm_models_root() /
                                                   "datasets/cnn_dailymail")),
    )
    with llm:
        outputs = llm.generate(
            prompts,
            sampling_params=SamplingParams(max_tokens=10),
        )

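    # Each completion must match its reference with at least 90% similarity.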
    assert len(outputs) == len(expected_outputs), "Output length mismatch"
    for output, expected in zip(outputs, expected_outputs):
        output_text = output.outputs[0].text
        print(output_text)
        print(output.outputs[0].token_ids)
        assert similar(
            output_text,
            expected), f"Expected '{expected}' but got '{output_text}'"


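# Running this file directly covers a representative subset (eager mode only);
# under pytest the full parameter grid above is exercised, and individual cases
# can be selected with pytest's -k option, e.g. -k "bf16 and tp1 and eager".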
if __name__ == '__main__':
    test_model("llama-3.1-model/Meta-Llama-3.1-8B",
               "bf16",
               1,
               torch_compile=False)
    test_model("llama-3.1-model/Meta-Llama-3.1-8B",
               "bf16",
               4,
               torch_compile=False)
    test_model("llama-3.1-model/Meta-Llama-3.1-8B",
               "fp8",
               1,
               torch_compile=False)
    test_model("llama-3.1-model/Meta-Llama-3.1-8B",
               "fp8",
               4,
               torch_compile=False)
    test_model("llama-3.1-model/Meta-Llama-3.1-8B",
               "fp8_kv_cache",
               1,
               torch_compile=False)
    test_model("llama-3.1-model/Meta-Llama-3.1-8B",
               "fp8_kv_cache",
               4,
               torch_compile=False)