TensorRT-LLMs/tests/model_api/test_model_quantization.py
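"""Tests that build INT4-AWQ and FP8 quantized LLaMA engines through the
LLaMAForCausalLM model API. Both tests expect the llama-7b-hf checkpoint
under the directory returned by llm_models_root() and require ammo."""
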
import os
import sys
import tempfile

import torch

import tensorrt_llm
from tensorrt_llm.models import LLaMAForCausalLM
from tensorrt_llm.quantization.mode import QuantMode

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.llm_data import llm_models_root
def ammo_installed():
    """Return True if the ammo quantization toolkit can be imported."""
    try:
        # isort: off
        import ammo.torch.quantization as atq
        from ammo.torch.export import export_model_config
        print(type(atq))
        print(type(export_model_config))
        # isort: on
        return True
    except Exception:
        return False


tensorrt_llm.logger.set_level('info')


def test_int4_awq_quantization():
    input_text = [
        'Born in north-east France, Soyer trained as a',
        "What is large language model?"
    ]
    awq_expected_output = [
        "chef in his native country, before moving to London",
        "\nLarge language model is a model that is"
    ]
    if not ammo_installed():
        print("Test skipped because ammo is not installed")
        return

    major, minor = torch.cuda.get_device_capability()
    if major < 8:
        print("Test skipped: INT4 AWQ requires Ampere (SM80) or newer GPUs")
        return

    max_batch_size, max_isl, max_osl = 8, 256, 256
    hf_model_dir = llm_models_root() / "llama-models/llama-7b-hf"
    tokenizer_dir = hf_model_dir

    # INT4 weight-only quantization with per-group scales (AWQ).
    quant_mode_int4_awq = QuantMode.from_description(quantize_weights=True,
                                                     quantize_activations=False,
                                                     per_token=False,
                                                     per_channel=False,
                                                     per_group=True,
                                                     use_int4_weights=True)

    llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir,
                                               'float16',
                                               quant_mode=quant_mode_int4_awq,
                                               quantize_lm_head=True)
    llama.to_trt(max_batch_size, max_isl, max_osl)

    engine_dir = "llama-awq-quantized"
    engine_temp = tempfile.TemporaryDirectory(engine_dir)
    engine_dir = engine_temp.name

    for idx, (inp, output) in enumerate(
            llama._generate(input_text, 10, tokenizer_dir=tokenizer_dir)):
        print(f"Input: {inp}")
        print(f"Output: {output}")
        assert output == awq_expected_output[
            idx], f"Expecting {awq_expected_output[idx]}, got {output}"
    # llama.save(engine_dir)


def test_fp8_quantization():
    input_text = [
        'Born in north-east France, Soyer trained as a',
        "What is large language model?"
    ]
    fp8_expected_output = [
        "chef in Paris and London before opening his first restaurant",
        "\nLarge language model is a model that is"
    ]
    major, minor = torch.cuda.get_device_capability()
    if not ammo_installed():
        print("Test skipped because ammo is not installed")
        return

    if not (f"{major}.{minor}" == "8.9" or major >= 9):
        print("Test skipped: FP8 is only supported on Ada (SM89) and Hopper or newer GPUs")
        return

    max_batch_size, max_isl, max_osl = 8, 256, 256
    hf_model_dir = llm_models_root() / "llama-models/llama-7b-hf"
    tokenizer_dir = hf_model_dir

    # Enable FP8 Q/DQ quantization and an FP8 KV cache.
    quant_mode = QuantMode(0)
    quant_mode = quant_mode.set_fp8_qdq()
    quant_mode = quant_mode.set_fp8_kv_cache()

    llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir,
                                               'float16',
                                               quant_mode=quant_mode)
    llama.to_trt(max_batch_size, max_isl, max_osl)

    engine_dir = "llama-fp8-quantized"
    engine_temp = tempfile.TemporaryDirectory(engine_dir)
    engine_dir = engine_temp.name

    for idx, (inp, output) in enumerate(
            llama._generate(input_text, 10, tokenizer_dir=tokenizer_dir)):
        print(f"Input: {inp}")
        print(f"Output: {output}")
        assert output == fp8_expected_output[
            idx], f"Expecting {fp8_expected_output[idx]}, got {output}"
    # llama.save(engine_dir)


if __name__ == "__main__":
    test_int4_awq_quantization()
    test_fp8_quantization()
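
# The tests can be run directly (as above) or collected by pytest; each test
# skips itself when ammo is missing or when the GPU lacks the required
# compute capability.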