mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* Update TensorRT-LLM --------- Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
122 lines
4.3 KiB
Python
122 lines
4.3 KiB
Python
import os
|
|
import sys
|
|
import tempfile
|
|
|
|
import torch
|
|
|
|
import tensorrt_llm
|
|
from tensorrt_llm.models import LLaMAForCausalLM
|
|
from tensorrt_llm.quantization.mode import QuantMode
|
|
|
|
# Append the parent directory of this file to sys.path; presumably this is
# what lets the `utils.llm_data` import that follows resolve when the test
# is run from its own directory.
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
|
from utils.llm_data import llm_models_root
|
|
|
|
|
|
def ammo_installed():
    """Report whether the AMMO quantization toolkit can be imported.

    Returns:
        bool: ``True`` when both ``ammo.torch.quantization`` and
        ``ammo.torch.export.export_model_config`` import without error,
        ``False`` when importing either of them raises any exception.
    """
    try:
        # isort: off
        import ammo.torch.quantization as atq  # noqa: F401
        from ammo.torch.export import export_model_config  # noqa: F401
        # isort: on
    except Exception:
        # Deliberately broad: this is a best-effort availability probe, so any
        # failure while importing is treated as "not installed".
        return False
    return True
|
|
|
|
|
|
# Module-level side effect: set the tensorrt_llm logger level to 'info' as
# soon as this file is imported.
tensorrt_llm.logger.set_level('info')
|
|
|
|
|
|
def test_int4_awq_quantization():
    """Check text generation from LLaMA-7B quantized with INT4 AWQ.

    The model is loaded from ``llama-models/llama-7b-hf`` under
    ``llm_models_root()`` with a per-group, INT4 weight-only ``QuantMode``
    (``quantize_lm_head=True``), converted with ``to_trt`` and then asked to
    generate for two prompts. Every generated string must match the
    corresponding reference string exactly.

    The test prints a note and returns early (without failing) when AMMO
    cannot be imported or when the GPU compute capability major version is
    below 8.

    Raises:
        AssertionError: if the number of generated results differs from the
            number of reference strings, or if any output differs from its
            reference.
    """
    input_text = [
        'Born in north-east France, Soyer trained as a',
        "What is large language model?",
    ]
    awq_expected_output = [
        "chef in his native country, before moving to London",
        "\nLarge language model is a model that is",
    ]

    if not ammo_installed():
        print("Test skipped due to ammo not installed")
        return

    # Only the major version matters for this check.
    major, _ = torch.cuda.get_device_capability()
    if major < 8:
        print("Test supported on post Ampere")
        return

    max_batch_size, max_isl, max_osl = 8, 256, 256
    hf_model_dir = llm_models_root() / "llama-models/llama-7b-hf"
    tokenizer_dir = hf_model_dir

    quant_mode_int4_awq = QuantMode.from_description(
        quantize_weights=True,
        quantize_activations=False,
        per_token=False,
        per_channel=False,
        per_group=True,
        use_int4_weights=True,
    )

    llama = LLaMAForCausalLM.from_hugging_face(
        hf_model_dir,
        'float16',
        quant_mode=quant_mode_int4_awq,
        quantize_lm_head=True,
    )
    llama.to_trt(max_batch_size, max_isl, max_osl)
    # NOTE: the built engine is not saved to disk by this test.

    # The second positional argument (10) is presumably the number of tokens
    # to generate -- confirm against `_generate`.
    results = list(llama._generate(input_text, 10,
                                   tokenizer_dir=tokenizer_dir))

    # Guard against a vacuous pass when fewer results than prompts come back.
    assert len(results) == len(awq_expected_output), (
        f"Expecting {len(awq_expected_output)} outputs, got {len(results)}")

    for (inp, output), expected in zip(results, awq_expected_output):
        print(f"Input: {inp}")
        print(f'Output: {output}')
        assert output == expected, f"Expecting {expected}, got {output}"
|
|
|
|
|
|
def test_fp8_quantization():
    """Check text generation from LLaMA-7B quantized to FP8.

    The model is loaded from ``llama-models/llama-7b-hf`` under
    ``llm_models_root()`` with a ``QuantMode`` that has FP8 QDQ and the FP8
    KV cache enabled, converted with ``to_trt`` and then asked to generate
    for two prompts. Every generated string must match the corresponding
    reference string exactly.

    The test prints a note and returns early (without failing) when AMMO
    cannot be imported, or when the GPU compute capability is neither 8.9 nor
    has a major version of at least 9.

    Raises:
        AssertionError: if the number of generated results differs from the
            number of reference strings, or if any output differs from its
            reference.
    """
    input_text = [
        'Born in north-east France, Soyer trained as a',
        "What is large language model?",
    ]
    fp8_expected_output = [
        "chef in Paris and London before opening his first restaurant",
        "\nLarge language model is a model that is",
    ]

    # Check for AMMO before touching CUDA so that hosts without it skip
    # cleanly, matching the order used by the INT4 AWQ test in this file.
    if not ammo_installed():
        print("Test skipped due to ammo not installed")
        return

    major, minor = torch.cuda.get_device_capability()
    if not ((major, minor) == (8, 9) or major >= 9):
        print("Test skipped fp8 only supported on Ada and post Hopper")
        return

    max_batch_size, max_isl, max_osl = 8, 256, 256
    hf_model_dir = llm_models_root() / "llama-models/llama-7b-hf"
    tokenizer_dir = hf_model_dir

    quant_mode = QuantMode(0)
    quant_mode = quant_mode.set_fp8_qdq()
    quant_mode = quant_mode.set_fp8_kv_cache()

    llama = LLaMAForCausalLM.from_hugging_face(
        hf_model_dir,
        'float16',
        quant_mode=quant_mode,
    )
    llama.to_trt(max_batch_size, max_isl, max_osl)
    # NOTE: the built engine is not saved to disk by this test.

    # The second positional argument (10) is presumably the number of tokens
    # to generate -- confirm against `_generate`.
    results = list(llama._generate(input_text, 10,
                                   tokenizer_dir=tokenizer_dir))

    # Guard against a vacuous pass when fewer results than prompts come back.
    assert len(results) == len(fp8_expected_output), (
        f"Expecting {len(fp8_expected_output)} outputs, got {len(results)}")

    for (inp, output), expected in zip(results, fp8_expected_output):
        print(f"Input: {inp}")
        print(f'Output: {output}')
        assert output == expected, f"Expecting {expected}, got {output}"
|
|
|
|
|
|
if __name__ == "__main__":
    # When executed as a script, run each quantization test in sequence.
    for run_test in (test_int4_awq_quantization, test_fp8_quantization):
        run_test()
|