# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import csv
import json
import os
import re
import shutil
import defs.ci_profiler
import pytest
from defs.common import (convert_weights, generate_summary_cmd, parse_output,
quantize_data, similar,
test_llm_torch_multi_lora_support,
test_multi_lora_support, venv_check_call,
venv_check_output, venv_mpi_check_call)
# yapf: disable
from defs.conftest import (get_device_count, get_device_memory,
get_host_total_memory, get_sm_version,
skip_fp8_pre_ada, skip_no_nvls, skip_post_blackwell,
skip_post_blackwell_ultra, skip_pre_ada,
skip_pre_blackwell)
# yapf: enable
from defs.trt_test_alternative import check_call, exists

# skip trt flow cases on post-Blackwell-Ultra
# if get_sm_version() >= 103:
#     pytest.skip(
#         "TRT workflow tests are not supported on post Blackwell-Ultra architecture",
#         allow_module_level=True)

INPUT_TEXT_1 = "After Washington had returned to Williamsburg, " + \
"Dinwiddie ordered him to lead a larger force to assist Trent in his work. " + \
"While en route, Washington learned of Trent's retreat. " + \
"Since Tanaghrisson had promised support to the British, " + \
"Washington continued toward Fort Duquesne and met with the Mingo leader. " + \
"Learning of a French scouting party in the area, Washington, " + \
"with Tanaghrisson and his party, surprised the Canadians on May 28 " + \
"in what became known as the Battle of Jumonville Glen. " + \
"They killed many of the Canadians, including their commanding officer, " + \
"Joseph Coulon de Jumonville, whose head was reportedly split open by " + \
"Tanaghrisson with a tomahawk. The historian Fred Anderson suggests that " + \
"Tanaghrisson was acting to gain the support of the British and regain " + \
"authority over his own people. They had been inclined to support the French, " + \
"with whom they had long trading relationships. One of Tanaghrisson's men told " + \
"Contrecoeur that Jumonville had been killed by British musket fire. " + \
"Question: Upon learning of a French scounting party in the area, " + \
"what did Washington do? Answer:"
INPUT_TEXT_2 = "Born in north-east France, Soyer trained as a"
@pytest.mark.parametrize("num_beams", [5, 7],
ids=["num_beams_4", "num_beams_7"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_early_finish_beams(llama_example_root, llm_venv, llama_model_root,
engine_dir, cmodel_dir, num_beams):
""" Test the correctness of beam search + streaming versus the outputs of
non-streaming beam search. Both use the cpp runtime.
This test is aimed specifically at checking if shorter finished beams are being put
into the outputs correctly."""
dtype = 'float16'
output_len = 10
input_text = ["want to", "The time is", "Soyer was"]
model_name = os.path.basename(llama_model_root)
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
f"--gemm_plugin={dtype}",
f"--max_beam_width={num_beams}",
"--context_fmha=enable",
"--use_paged_context_fmha=enable",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
    streaming_command = [
        f"{llama_example_root}/../run.py", f"--max_output_len={output_len}",
        f"--engine_dir={engine_dir}", f"--tokenizer_dir={llama_model_root}",
        "--streaming", "--streaming_interval=1", f"--num_beams={num_beams}",
        "--input_text", *input_text
    ]
streaming_outputs = venv_check_output(llm_venv, streaming_command)
joined_nonstreamed_outputs = ""
for length_iterator in range(1, output_len + 1):
        command = [
            f"{llama_example_root}/../run.py",
            f"--max_output_len={length_iterator}", f"--engine_dir={engine_dir}",
            f"--tokenizer_dir={llama_model_root}", f"--num_beams={num_beams}",
            "--input_text", *input_text
        ]
non_streaming_output = venv_check_output(llm_venv, command)
joined_nonstreamed_outputs += "Output from command" + str(
command) + "\n" + non_streaming_output

    def parse_beam_outputs(text: str) -> list[str]:
        """Extract the generated beam texts from run.py's stdout."""
        results = []
        while True:
            match = re.search(
                r"Output \[Text \d+ Beam \d+\]: \"([^\"]*)\"\r?\n", text)
            if match is None:
                break
            _, end = match.span()
            results.append(match.group(1))
            text = text[end:]
        return results
print("STREAMING OUTPUT HERE\n\n\n",
streaming_outputs,
"\n\n\n",
sep="----")
print("NON-STREAMING OUTPUT HERE\n\n\n",
joined_nonstreamed_outputs,
"\n\n\n",
sep="----")
    parsed_streamed_outputs = parse_beam_outputs(streaming_outputs)
    parsed_nonstreamed_outputs = parse_beam_outputs(joined_nonstreamed_outputs)

    def ordered_subset(s1, s2):
        """Check that s1 (the streamed outputs) is an ordered subset of s2
        (the non-streamed outputs); streaming can sometimes skip outputs."""
        s2 = iter(s2)
        try:
            for c in s1:
                while next(s2) != c:
                    pass
            return True
        except StopIteration:
            return False
streaming_is_subset = ordered_subset(parsed_streamed_outputs,
parsed_nonstreamed_outputs)
print("streaming_is_subset ", streaming_is_subset)
assert streaming_is_subset
is_equal = (parsed_streamed_outputs == parsed_nonstreamed_outputs)
print("is_equal", is_equal)
if not is_equal:
print("Differences:")
for streamed, nonstreamed in zip(parsed_streamed_outputs,
parsed_nonstreamed_outputs):
if (streamed != nonstreamed):
print("Streamed:", streamed)
print("Nonstreamed:", nonstreamed)
assert is_equal
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("use_weight_only_groupwise_quant_matmul_plugin",
[True, False],
ids=[
"enable_weight_only_groupwise_quant_matmul_plugin",
"disable_weight_only_groupwise_quant_matmul_plugin"
])
@pytest.mark.parametrize("run_type", ['inference', 'summarization'])
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu(use_weight_only_groupwise_quant_matmul_plugin,
run_type, data_type, llama_example_root,
llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir, num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
model_name = 'llama_v1-{}'.format(run_type)
print("Build engines...")
if not use_weight_only_groupwise_quant_matmul_plugin:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type)
else:
model_name = 'llama_v1-int4_gptq-{}'.format(run_type)
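        # The pre-quantized GPTQ checkpoint is resolved relative to the HF
        # model directory (two levels up).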
llama_gptq_safetensors_root = os.path.join(
llama_model_root, "../..", "int4-quantized-gptq-awq",
"llama-7b-4bit-gs128.safetensors")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
quant_ckpt_path=llama_gptq_safetensors_root)
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}", "--remove_input_padding=enable",
f"--max_beam_width={num_beams}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if run_type == "inference":
print("Run inference...")
venv_check_call(llm_venv, [
f"{llama_example_root}/../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
])
elif run_type == "summarization":
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type=fp16",
f"--engine_dir={engine_dir}", "--check_accuracy",
f"--num_beams={num_beams}", f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_manage_weights_1gpu_summarize(llama_example_root,
llama_model_root,
llm_datasets_root,
llm_rouge_root, llm_venv,
cmodel_dir, engine_dir):
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama_v1-float16",
model_path=llama_model_root,
data_type="float16")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin=float16",
f"--gemm_plugin=disable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type=fp16",
f"--engine_dir={engine_dir}", "--check_accuracy",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@skip_pre_blackwell
@skip_post_blackwell_ultra
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize("fuse_fp4_quant", ["enable", "disable"],
ids=["enable_fused_quant", "disable_fused_quant"])
@pytest.mark.parametrize(
"norm_quant_fusion", ["enable", "disable"],
ids=["enable_norm_quant_fusion", "disable_norm_quant_fusion"])
@pytest.mark.parametrize(
"llama_model_root",
['llama-v3-8b-instruct-hf', 'llama-3.1-8b', 'llama-3.1-70b-instruct'],
indirect=True)
def test_llm_llama_1gpu_fp4(
mmlu_dataset_root,
data_type,
fp4_type,
fuse_fp4_quant,
norm_quant_fusion,
llama_example_root,
llama_model_root,
llm_venv,
cmodel_dir,
engine_dir,
qcache_dir_without_install_package,
llm_datasets_root,
):
model_name = os.path.basename(llama_model_root)
if fp4_type != "disable":
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="nvfp4",
kv_cache_dtype="fp8",
quantize_dir=qcache_dir_without_install_package)
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--max_batch_size=32"
]
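    # "ootb" exercises the non-plugin (OOTB) FP4 GEMM path, while "plugin"
    # uses the nvfp4 GEMM plugin; "disable" builds without FP4 quantization.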
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
if fp4_type == "plugin" or fuse_fp4_quant == "enable":
build_cmd.extend([
"--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable"
])
if fuse_fp4_quant == "enable":
build_cmd.extend(["--fuse_fp4_quant=enable"])
if norm_quant_fusion == 'enable':
build_cmd.extend(["--norm_quant_fusion=enable"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
accuracy_map = {
'llama-v3-8b-instruct-hf': 61.5,
'Meta-Llama-3.1-8B': 61.0,
'Meta-Llama-3.1-70B-Instruct': 75
}
acc_thres = accuracy_map[model_name]
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={acc_thres}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_pre_blackwell
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize(
"llama_model_root",
['llama-v3-8b-instruct-hf', 'llama-3.1-8b', 'llama-3.1-70b-instruct'],
indirect=True)
def test_llm_llama_1gpu_fp4_model_config(
fp4_type,
llama_example_root,
llama_model_root,
llm_venv,
cmodel_dir,
engine_dir,
qcache_dir_without_install_package,
llm_datasets_root,
):
model_name = os.path.basename(llama_model_root)
if fp4_type != "disable":
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="nvfp4",
kv_cache_dtype="fp8",
quantize_dir=qcache_dir_without_install_package)
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type="float16")
print("Build engines...")
build_cmd = [
"trtllm-build", f"--model_config={model_dir}/config.json",
f"--output_dir={engine_dir}", "--max_batch_size=32"
]
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
if fp4_type == "plugin":
build_cmd.extend([
"--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable"
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
@skip_pre_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-70b-instruct'],
indirect=True)
def test_llm_llama_2gpu_fp4(mmlu_dataset_root, fp4_type, llama_example_root,
llama_model_root, llm_venv, engine_dir,
qcache_dir_without_install_package,
llm_datasets_root):
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="nvfp4",
tp_size=2,
quantize_dir=qcache_dir_without_install_package)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--max_batch_size=32",
]
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
acc_thres = 75
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={acc_thres}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-405b'], indirect=True)
def test_llm_llama_8gpu_fp4(mmlu_dataset_root, fp4_type, llama_example_root,
llama_model_root, llm_venv, engine_dir,
qcache_dir_without_install_package,
llm_datasets_root, upgrade_transformers):
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="nvfp4",
tp_size=8,
quantize_dir=qcache_dir_without_install_package)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--max_batch_size=32", "--workers=4"
]
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
acc_thres = 75
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={acc_thres}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("run_type", ['inference', 'summarization'])
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("fp8_cache", [True, False],
ids=["enable_fp8", "disable_fp8"])
@pytest.mark.parametrize("llama_model_root", [
'llama-v2-7b-hf', 'llama-v3-8b-instruct-hf', 'llama-3.1-8b-instruct-hf-fp8'
],
indirect=True)
def test_llm_llama_1gpu(run_type, data_type, fp8_cache, llama_example_root,
llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
qcache_dir_without_install_package, num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
    use_fp8 = fp8_cache or "fp8" in llama_model_root.lower()
skip_fp8_pre_ada(use_fp8=use_fp8)
model_name = os.path.basename(llama_model_root)
if llama_model_root.endswith('Llama-3.1-8B-Instruct-FP8'):
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama_v3_hf_fp8",
model_path=llama_model_root,
fp8_kv_cache=fp8_cache,
data_type=data_type)
elif fp8_cache:
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
enable_fp8=fp8_cache,
fp8_kv_cache=fp8_cache,
quant_ckpt_path=
f"{qcache_dir_without_install_package}/quantized_fp8/llama_tp1_rank0.npz"
if fp8_cache else None)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if run_type == "inference":
print("Run inference...")
venv_check_call(llm_venv, [
f"{llama_example_root}/../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
])
elif run_type == "summarization":
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 14,
2: 19,
4: 19,
}[num_beams]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_pre_ada
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_1gpu_fp8_kv_cache(
data_type,
llama_example_root,
llama_model_root,
llm_datasets_root,
llm_rouge_root,
llm_venv,
cmodel_dir,
engine_dir,
qcache_dir_without_install_package,
):
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
"--use_paged_context_fmha=enable",
"--use_fp8_context_fmha=enable",
"--max_beam_width=1",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
with open(f"{engine_dir}/config.json") as f:
engine_config = json.load(f)
assert engine_config["build_config"]["plugin_config"][
"use_fp8_context_fmha"] == True
assert engine_config["pretrained_config"]["quantization"][
"kv_cache_quant_algo"] == "FP8"
@pytest.mark.parametrize("use_weight_sparsity", [True],
ids=["enable_weight_sparsity"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_sparsity(llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root, llm_venv,
cmodel_dir, engine_dir,
use_weight_sparsity):
model_name = 'llama_v2'
data_type = 'float16'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--log_level=verbose"
]
if use_weight_sparsity:
build_cmd.extend(["--weight_sparsity"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference...")
venv_check_call(llm_venv, [
f"{llama_example_root}/../run.py", "--max_output_len=50",
f"--tokenizer_dir={llama_v2_tokenizer_model_root}",
f"--engine_dir={engine_dir}", f"--num_beams=1"
])
@skip_post_blackwell
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v3-8b-instruct-hf'],
indirect=True)
def test_llm_llama_v3_int8_gptq_1gpu_summary(data_type, llama_example_root,
llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
model_name = 'llama_v3-int8_gptq'
llama_gptq_safetensors_root = os.path.join(
llama_model_root, "../..", "int8-quantized-gptq",
"llama-3-8b-8bit-gs64-gptq.safetensors")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
quant_ckpt_path=llama_gptq_safetensors_root)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}", "--remove_input_padding=enable",
f"--max_beam_width={num_beams}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type=fp16",
f"--engine_dir={engine_dir}", "--check_accuracy",
"--tensorrt_llm_rouge1_threshold=24", f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['float16'])
@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
indirect=True)
def test_llm_llama_4gpu_pp4(data_type, llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, num_beams):
model_name = os.path.basename(llama_model_root)
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=1,
pp_size=4,
)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin={data_type}",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 12,
}[num_beams]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "4", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_fp8_2gpu_pp2(
data_type, llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, qcache_dir_without_install_package, num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
tp_size=1,
pp_size=2,
kv_cache_dtype="fp8",
calib_size=64)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
"--use_paged_context_fmha=disable",
"--use_fp8_context_fmha=disable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 13,
2: 19,
4: 19,
}[num_beams]
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_v2_tokenizer_model_root}",
"--data_type=fp16", f"--engine_dir={engine_dir}",
f"--tensorrt_llm_rouge1_threshold={tensorrt_llm_rouge1_threshold}",
"--check_accuracy", f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_gather_logits_2gpu_pp2(llama_example_root,
llama_model_root,
llm_datasets_root, llm_rouge_root,
llama_v2_tokenizer_model_root,
llm_venv, cmodel_dir, engine_dir):
# Check the availability of gather all token logits when pp>1
model_name = 'llama_v2'
data_type = 'float16'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
pp_size=2)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--max_batch_size=2",
"--max_beam_width=1", f"--gemm_plugin={data_type}",
f"--gpt_attention_plugin={data_type}", "--gather_context_logits"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_v2_tokenizer_model_root}",
"--data_type=fp16", f"--engine_dir={engine_dir}", "--eval_ppl",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", [
'llama-v2-7b-hf', 'llama-v2-13b-hf', 'llama-v2-70b-hf', 'Llama-2-7B-AWQ',
'Llama-2-7B-GPTQ'
],
indirect=True)
def test_llm_llama_v2_awq_2gpu_summary(llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
qcache_dir_without_install_package):
if (num_beams > 2
or "70b" in llama_model_root) and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
if 'Llama-2-7B-AWQ' in llama_model_root or 'Llama-2-7B-GPTQ' in llama_model_root:
print("Converting model...")
ckpt_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=qcache_dir_without_install_package,
model="llama_v2",
model_path=llama_model_root,
data_type="auto",
tp_size=2,
pp_size=1)
else:
print("Quantizing model...")
ckpt_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
calib_size=32)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_v2_tokenizer_model_root,
data_type="fp16",
engine_dir=engine_dir,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@skip_post_blackwell # AutoQ contains AWQ int4 recipe, which is not supported on Blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_v3_1_autoq_1gpu_mmlu(llama_example_root, llama_model_root,
llm_datasets_root, mmlu_dataset_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
quantize_dir=qcache_dir_without_install_package,
tp_size=1,
calib_size=4,
batch_size=4,
autoq_format='int4_awq,fp8,w4a8_awq',
auto_quantize_bits=5.8)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--max_batch_size=8",
"--max_input_len=4000",
"--max_seq_len=4096",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={63.8}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-70b'], indirect=True)
def test_llm_llama_v3_1_autoq_2gpu_mmlu(llama_example_root, llama_model_root,
llm_datasets_root, mmlu_dataset_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
calib_size=4,
batch_size=4,
autoq_format='int4_awq,fp8,w4a8_awq',
auto_quantize_bits=5.8)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--max_batch_size=8",
"--max_input_len=4000",
"--max_seq_len=4096",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={77.58}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("num_beams", [4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b', 'llama-30b'],
indirect=True)
def test_llm_llama_v1_2gpu_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, num_beams):
model_name = 'llama_v1_2gpu'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=2,
tp_size=2,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--check_accuracy", f"--hf_model_dir={llama_model_root}",
f"--engine_dir={engine_dir}", f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_host_memory(480000)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-v2-70b'], indirect=True)
def test_llm_llama_v2_8gpu_summary(llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, num_beams):
"run llamav2 70 test on 8 gpus"
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
model_name = 'llama_v2-meta-ckpt-70b'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=8,
workers=8,
tp_size=8,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
"--workers=8",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_v2_tokenizer_model_root,
data_type="fp16",
engine_dir=engine_dir,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "8", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("num_beams", [2, 5],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu_paged_kv_cache(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams):
"RCCA https://nvbugs/4283902"
print("Build engines...")
model_name = 'llama_v1-paged_kv_cache'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--max_beam_width={num_beams}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
"--max_batch_size=2",
"--tokens_per_block=16",
"--paged_kv_cache=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type", "fp16",
"--check_accuracy", f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}", f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_v1_4gpu_paged_kv_cache(llama_example_root, llama_model_root,
llm_venv, cmodel_dir, engine_dir):
"""
RCCA https://nvbugs/4251782
RCCA https://nvbugs/4755248
"""
model_name = 'llama_v1-4gpu_paged_kv_cache'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=4,
tp_size=4,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
"--max_batch_size=128",
"--max_input_len=512",
"--max_seq_len=1024",
"--max_beam_width=1",
"--paged_kv_cache=enable",
]
print("Build engines...")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=10",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--max_attention_window_size=128",
"--kv_cache_enable_block_reuse",
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "4", "--allow-run-as-root"],
run_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu_kv_cache_reuse_with_prompt_table(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir):
max_prompt_embedding_table_size = 16
hidden_size = 4096
vocab_size = 32000
input_len = 42
print("Convert checkpoint...")
model_name = 'llama_v1-kv_cache_reuse_w_prompt_table'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}/engines", "--gpt_attention_plugin=float16",
"--gemm_plugin=float16", "--remove_input_padding=enable",
"--max_batch_size=1",
f"--tokens_per_block={max_prompt_embedding_table_size}",
"--paged_kv_cache=enable", "--use_paged_context_fmha=enable",
f"--max_prompt_embedding_table_size={max_prompt_embedding_table_size}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# generate input ids, dummy prompt table and extra ids
input_file = f"{engine_dir}/input_ids.npy"
prompt_table_path = f"{engine_dir}/prompt_table.npy"
extra_ids_file = f"{engine_dir}/extra_ids.npy"
# run the script inside venv since it depends on numpy
venv_script = f'''
import numpy as np
input_ids = [[
i + {vocab_size} if i < {max_prompt_embedding_table_size} else i + 1000
for i in range({input_len})
]]
np.save("{input_file}", np.array(input_ids))
prompt_table_shape = (1, {max_prompt_embedding_table_size}, {hidden_size})
prompt_table = np.random.rand(*prompt_table_shape).astype(np.float16)
np.save("{prompt_table_path}", prompt_table)
extra_ids = [[
1 if i < {max_prompt_embedding_table_size} else 0
for i in range({input_len})
]]
np.save("{extra_ids_file}", np.array(extra_ids))
'''
llm_venv.run(venv_script)
# add --run_profiling to run the request for multiple times
print("Run inference")
run_cmd = [
f"{llama_example_root}/../../../run.py", "--max_output_len=10",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}/engines", f"--input_file={input_file}",
f"--prompt_table_path={prompt_table_path}",
"--kv_cache_enable_block_reuse",
f"--input_token_extra_ids_file={extra_ids_file}", "--run_profiling"
]
venv_check_output(llm_venv, run_cmd)
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
"fp8_context_fmha_xqa",
["enable_fp8_context_fmha_xqa", "disable_fp8_context_fmha_xqa"])
@pytest.mark.parametrize("reduce_fusion",
["enable_reduce_fusion", "disable_reduce_fusion"])
@pytest.mark.parametrize("llama_model_root",
['llama-7b', 'llama-v2-13b-hf', 'llama-v2-70b-hf'],
indirect=True)
def test_llm_llama_2gpu_fp8_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
engine_dir,
qcache_dir_without_install_package,
fp8_context_fmha_xqa, reduce_fusion):
"RCCA https://nvbugs/4348560"
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
"--workers=2",
"--max_beam_width=4",
]
if "enable" in fp8_context_fmha_xqa:
build_cmd.extend([
"--use_fp8_context_fmha=enable", "--use_paged_context_fmha=enable"
])
if "enable" in reduce_fusion:
build_cmd.extend(["--reduce_fusion=enable"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type='fp16',
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root,
num_beams=4)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_1gpu_batched_beam_search(llama_example_root,
llama_model_root, llm_datasets_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
"llama run batched beam search on 1 gpu"
qmodel_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
dtype="float16",
quantize_dir=qcache_dir_without_install_package)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--paged_kv_cache=enable",
"--max_batch_size=4",
"--max_beam_width=4",
"--max_input_len=512",
"--max_seq_len=532",
"--gemm_plugin=float16",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
num_beams = 4
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=20",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
f"--num_beams={num_beams}",
"--input_text",
"Miguel de Cervantes wrote",
"Diego Velazquez painted his most famous painting,",
"Miguel de Cervantes wrote",
"Diego Velazquez painted his most famous painting,",
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
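    # The four prompts are two distinct prompts submitted twice, so the beams
    # for prompt idx and prompt idx + 2 must be identical.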
for idx in [0, 1]:
assert all(
[
a == b for (a, b) in zip(
output[num_beams * idx:num_beams * idx +
num_beams], output[num_beams * (idx + 2):num_beams *
(idx + 2) + num_beams])
]
), f"outputs {idx} and {idx+2} don't match: {output[num_beams * idx:num_beams * idx + num_beams]}, {output[num_beams * (idx + 2):num_beams * (idx + 2) + num_beams]}"
expected_output = [
["Don Quixote in 1605. The book is considered the first modern novel."],
[
"Las Meninas, in 1656. The painting is a portrait of King Philip IV",
"\"Las Meninas\" in 1656. The painting depicts King Philip"
],
]
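    # Each beam only has to be similar to one of the reference continuations
    # for its prompt, since beam search may return either phrasing.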
for idx, result in enumerate(output):
assert any(
[
similar(item, result)
for item in expected_output[(idx // num_beams) % 2]
]
), f"output {result} is not similar to any of {expected_output[(idx // num_beams) % 2]}"
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("mmlu_test", [True, False],
ids=["enable_mmlu_test", "disable_mmlu_test"])
@pytest.mark.parametrize(
"fp8_fmha",
["enable_fp8_fmha", "enable_fp8_paged_fmha", "disable_fp8_fmha"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_fp8_summary_and_mmlu(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
mmlu_dataset_root, mmlu_test, llm_venv, engine_dir,
qcache_dir_without_install_package, fp8_fmha):
"run Llama v2 fp8 quantization tests"
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
use_fp8_context_fmha = "enable" if fp8_fmha in [
"enable_fp8_fmha", "enable_fp8_paged_fmha"
] else "disable"
use_paged_context_fmha = "enable" if fp8_fmha == "enable_fp8_paged_fmha" else "disable"
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--use_fp8_context_fmha={use_fp8_context_fmha}",
f"--use_paged_context_fmha={use_paged_context_fmha}",
"--remove_input_padding=enable",
"--max_batch_size=4",
"--max_input_len=2046",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=32",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
print(output)
print("Run Summarization test with batch size = 1")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_model_root}",
"--data_type",
"fp16",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
# rouge1 threshold reduced from 15 (default) to 14 since we now enable fused mlp by default and the scales of two linear layers can be different
"--tensorrt_llm_rouge1_threshold=14",
]
venv_check_call(llm_venv, summary_cmd)
if mmlu_test:
print("Run MMLU test")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={45.0}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_fp8_gemv(llama_example_root, llama_model_root,
llm_datasets_root, llm_venv, engine_dir,
qcache_dir_without_install_package):
"run Llama v2 fp8 quantization tests"
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin=fp8",
"--max_batch_size=4",
"--max_input_len=2048",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=32",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
print(output)
print("Run Summarization test with batch size = 1")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_model_root}",
"--data_type",
"fp16",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--dataset_dir={llm_datasets_root}",
"--tensorrt_llm_rouge1_threshold=14.5",
]
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("gemm_swiglu_plugin", ["fp8"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_gemm_swiglu(llama_example_root, llama_model_root,
llm_datasets_root, llm_venv, engine_dir,
qcache_dir_without_install_package,
gemm_swiglu_plugin, data_type):
"run Llama v2 gemm_swiglu_plugin tests"
if gemm_swiglu_plugin == "fp8":
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
pytest.skip(f"gemm_swiglu_plugin only supports fp8 now")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin=fp8",
f"--gemm_swiglu_plugin={gemm_swiglu_plugin}",
"--remove_input_padding=enable",
"--max_batch_size=4",
"--max_input_len=2048",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=32",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
print(output)
print("Run Summarization test")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_model_root}",
"--data_type",
"fp16",
f"--engine_dir={engine_dir}",
"--check_accuracy",
"--max_ite=40",
f"--dataset_dir={llm_datasets_root}",
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize(
"data_type", [
'float16', 'fp8',
pytest.param('sq_ootb', marks=skip_post_blackwell),
pytest.param('awq', marks=skip_post_blackwell),
pytest.param('int8_wo', marks=skip_post_blackwell)
],
ids=['base_fp16', 'base_fp8', 'base_sq_ootb', 'base_awq', 'base_int8_wo'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-13b-hf'], indirect=True)
@pytest.mark.parametrize("llm_lora_model_root", ['chinese-llama-2-lora-13b'],
indirect=True)
def test_llm_llama_v2_lora_1gpu(data_type, lora_data_type, llama_example_root,
llama_model_root, llm_datasets_root, llm_venv,
cmodel_dir, engine_dir, llm_lora_model_root,
qcache_dir_without_install_package):
"run llama lora test on 1gpu"
print("Build engines...")
model_name = 'llama_v2-lora'
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
elif data_type == 'sq_ootb':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'awq':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
awq_block_size=128,
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'int8_wo':
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
use_weight_only=True,
weight_only_precision='int8')
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--lora_plugin=auto",
"--gemm_plugin=auto",
f"--lora_dir={llm_lora_model_root}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
ref_1 = [
29871, 32160, 33657, 33281, 30214, 30672, 30780, 33820, 32024, 30214,
32083, 33820, 30755, 37432, 32030, 30313, 30214, 30417, 30210, 30505,
34870, 30214, 30417, 30210, 30505, 31656, 39298, 30214, 32063, 30210
]
ref_2 = [
29871, 32160, 33657, 33281, 30214, 30672, 30780, 33820, 32024, 30214,
33759, 41026, 31381, 30769, 31811, 31900, 30214, 36869, 31900, 36869,
31900, 30214, 36869, 31900, 36869, 31900, 31900, 31900, 31900, 31900
]
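    # ref_1 / ref_2 are the expected token ids with and without the LoRA
    # adapter; the exact-match checks below only apply to the float16 base.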
input_text = "今天天气很好,我到公园的时候,"
# TODO change to chinese evaluation task in the future
base_run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=20",
f"--input_text={input_text}",
f"--tokenizer_dir={llm_lora_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
]
for use_py_session in [True, False]:
if use_py_session:
print("Run inference with Python runtime...")
else:
print("Run inference with C++ runtime...")
print(f"Run inference with lora id 0...")
run_cmd = copy.deepcopy(base_run_cmd)
run_cmd.extend([
"--lora_task_uids=0",
f"--output_csv={llm_venv.get_working_directory()}/use_lora.csv"
])
if use_py_session:
run_cmd.append("--use_py_session")
venv_check_call(llm_venv, run_cmd)
with open(f"{llm_venv.get_working_directory()}/use_lora.csv") as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
assert ref_1 == predict or data_type != "float16"
print(f"Run inference with lora id -1...")
run_cmd = copy.deepcopy(base_run_cmd)
run_cmd.extend([
"--lora_task_uids=-1",
f"--output_csv={llm_venv.get_working_directory()}/no_lora.csv"
])
if use_py_session:
run_cmd.append("--use_py_session")
venv_check_call(llm_venv, run_cmd)
with open(f"{llm_venv.get_working_directory()}/no_lora.csv") as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
assert ref_2 == predict or data_type != "float16"
@pytest.mark.parametrize(
"data_type", ['float16', 'fp8', 'sq_ootb', 'awq', 'int8_wo'],
ids=['base_fp16', 'base_fp8', 'base_sq_ootb', 'base_awq', 'base_int8_wo'])
@pytest.mark.parametrize("llama_model_root", ['llama-v3-8b-hf'], indirect=True)
@pytest.mark.parametrize("llm_dora_model_root",
['commonsense-llama-v3-8b-dora-r32'],
indirect=True)
def test_llm_llama_v3_dora_1gpu(data_type, llama_example_root, llama_model_root,
llm_dora_model_root, llm_datasets_root,
llm_venv, cmodel_dir, engine_dir,
qcache_dir_without_install_package):
"run llama dora test on 1gpu"
print("Build engines...")
model_name = 'llama_v3-dora'
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
elif data_type == 'sq_ootb':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'awq':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
awq_block_size=128,
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'int8_wo':
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
use_weight_only=True,
weight_only_precision='int8')
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
# normalize dora magnitude
dora_weights = f"{llm_venv.get_working_directory()}/dora_weights"
normalize_cmd = [
f"{llama_example_root}/../../../dora/normalize_weights.py", "-i",
llm_dora_model_root, "-b", llama_model_root, "-o", dora_weights
]
venv_check_call(llm_venv, normalize_cmd)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--lora_plugin=auto",
"--dora_plugin=enable",
"--remove_input_padding=enable", # otherwise no cpp runtime
"--kv_cache_type=paged", # otherwise no cpp runtime
"--gemm_plugin=auto",
f"--lora_dir={dora_weights}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
input_tokens = [
128000, 39314, 374, 459, 7754, 430, 16964, 264, 3465, 11, 35526, 449,
459, 1988, 430, 5825, 4726, 2317, 13, 9842, 264, 2077, 430, 36001,
45695, 279, 1715, 382, 394, 17010, 30151, 512, 394, 5321, 5268, 279,
4495, 4320, 311, 279, 3488, 25, 578, 842, 1121, 304, 279, 1920, 315,
7397, 74767, 374, 279, 5788, 315, 13465, 323, 24463, 13, 16299, 3094,
17738, 279, 7314, 315, 7397, 74767, 31931, 16533, 16, 25, 36424, 4907,
374, 42101, 1555, 279, 20282, 13, 22559, 17, 25, 8828, 4907, 374, 16489,
311, 11742, 4907, 13, 22559, 18, 25, 92479, 5237, 25734, 304, 279,
16312, 41255, 3177, 4907, 13, 22559, 19, 25, 8219, 4238, 374, 16489,
1139, 37833, 5237, 25734, 4286, 16533, 3645, 25, 4320, 16, 14, 9399, 17,
14, 9399, 18, 14, 9399, 19, 271, 394, 17010, 5688, 512, 72348, 394,
17010, 6075, 1473
]
out_ref = [
128000, 39314, 374, 459, 7754, 430, 16964, 264, 3465, 11, 35526, 449,
459, 1988, 430, 5825, 4726, 2317, 13, 9842, 264, 2077, 430, 36001,
45695, 279, 1715, 382, 394, 17010, 30151, 512, 394, 5321, 5268, 279,
4495, 4320, 311, 279, 3488, 25, 578, 842, 1121, 304, 279, 1920, 315,
7397, 74767, 374, 279, 5788, 315, 13465, 323, 24463, 13, 16299, 3094,
17738, 279, 7314, 315, 7397, 74767, 31931, 16533, 16, 25, 36424, 4907,
374, 42101, 1555, 279, 20282, 13, 22559, 17, 25, 8828, 4907, 374, 16489,
311, 11742, 4907, 13, 22559, 18, 25, 92479, 5237, 25734, 304, 279,
16312, 41255, 3177, 4907, 13, 22559, 19, 25, 8219, 4238, 374, 16489,
1139, 37833, 5237, 25734, 4286, 16533, 3645, 25, 4320, 16, 14, 9399, 17,
14, 9399, 18, 14, 9399, 19, 271, 394, 17010, 5688, 512, 72348, 394,
17010, 6075, 1473, 394, 279, 4495, 4320, 374, 4320, 18, 128001, 128001,
128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
128001, 128001, 128001, 128001, 128001
]
in_csv = f"{llm_venv.get_working_directory()}/input.csv"
out_csv = f"{llm_venv.get_working_directory()}/output.csv"
with open(in_csv, "w") as f:
writer = csv.writer(f)
writer.writerow(input_tokens)
    base_run_cmd = [
        f"{llama_example_root}/../../../run.py",
        f"--input_file={in_csv}", f"--tokenizer_dir={llama_model_root}",
        f"--engine_dir={engine_dir}", "--max_output_len=32"
    ]
for use_py_session in [True, False]:
if use_py_session:
print("Run inference with Python runtime...")
else:
print("Run inference with C++ runtime...")
print(f"Run inference with lora id 0...")
run_cmd = copy.deepcopy(base_run_cmd)
run_cmd.extend(["--lora_task_uids=0", f"--output_csv={out_csv}"])
if use_py_session:
run_cmd.append("--use_py_session")
venv_check_call(llm_venv, run_cmd)
with open(out_csv) as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
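        # Exact token-level comparison is only meaningful for the float16
        # baseline; quantized variants may deviate from the reference output.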
assert out_ref == predict or data_type != "float16"
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"tp_pp_size", [(8, 1), (4, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize("test_case", ["pg64317"], indirect=True)
def test_llm_llama_long_alpaca_8gpu_summary(llama_example_root,
llm_long_alpaca_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams, tp_pp_size, test_case):
"llama test for long alpaca"
tp_size, pp_size = tp_pp_size
world_size = 8
assert tp_size * pp_size == world_size, \
f'tp_size({tp_size}) x pp_size({pp_size}) != 8'
model_name = 'llama_long_alpaca'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llm_long_alpaca_model_root,
gpus=world_size,
tp_size=tp_size,
pp_size=pp_size,
data_type="bfloat16")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=bfloat16",
"--remove_input_padding=enable",
"--gemm_plugin=bfloat16",
f"--max_beam_width={num_beams}",
"--max_input_len=32768",
"--max_seq_len=49152",
"--max_batch_size=1",
"--max_num_tokens=32768",
]
print("Build engines...")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
max_output_len = test_case["max_output_len"]
run_cmd = [
f"{llama_example_root}/../../../run.py",
f"--max_output_len={max_output_len}",
f"--input_file={test_case['input_file']}", f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
f"--tokenizer_dir={llm_long_alpaca_model_root}",
"--max_input_length=32768"
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
run_cmd)
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llm_long_alpaca_model_root,
max_input_length=16384,
output_len=max_output_len,
data_type="fp16",
num_beams=num_beams,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu_streaming_llm(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams, gemm_plugin):
"Run LLaMa with StreamingLLM"
model_name = 'llama_v1-streamingllm'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
"--streamingllm=enable",
"--max_batch_size=256",
]
if gemm_plugin:
build_cmd.append("--gemm_plugin=float16")
else:
build_cmd.append("--gemm_plugin=disable")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
max_attention_window_size=2048,
sink_token_length=4,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"gpt_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize(
"context_fmha_type",
["enable_context_fmha", "enable_with_fp32_acc", "disable_context_fmha"])
@pytest.mark.parametrize("code_llama_model_root", ['CodeLlama-7b-Instruct'],
indirect=True)
def test_llm_llama_code_llama_1gpu_summary(
llama_example_root, code_llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir, engine_dir, num_beams,
gemm_plugin, gpt_attention_plugin, context_fmha_type):
"Run CodeLlaMa on single gpu"
model_name = 'code_llama_1gpu'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=code_llama_model_root,
data_type="float16",
gpus=1,
tp_size=1,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--max_batch_size={1}",
f"--max_input_len={1024}",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--max_beam_width={num_beams}",
f"--max_seq_len={8192}",
]
if gpt_attention_plugin:
build_cmd.extend(
["--remove_input_padding=enable", "--gpt_attention_plugin=float16"])
else:
build_cmd.append("--gpt_attention_plugin=disable")
build_cmd.append("--remove_input_padding=disable")
build_cmd.append("--paged_kv_cache=disable")
if gemm_plugin:
build_cmd.append("--gemm_plugin=float16")
else:
build_cmd.append("--gemm_plugin=disable")
if context_fmha_type == "enable_context_fmha":
build_cmd.append("--context_fmha=enable")
elif context_fmha_type == "disable_context_fmha":
build_cmd.append("--context_fmha=disable")
print("Build engines...")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=40",
f"--tokenizer_dir={code_llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
"--input_text='In Bash, how do I list all text files?'",
]
if context_fmha_type == "enable_with_fp32_acc":
run_cmd.append("--enable_context_fmha_fp32_acc")
venv_check_call(llm_venv, run_cmd)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=code_llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=17,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell_ultra
@pytest.mark.timeout(7200)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"tp_pp_size", [(4, 1), (2, 2), (8, 1), (4, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize("code_llama_model_root",
['CodeLlama-34b-Instruct', 'CodeLlama-70b-hf'],
indirect=True)
def test_llm_llama_code_llama_multi_gpus_summary(llama_example_root,
code_llama_model_root,
llm_datasets_root,
llm_rouge_root, llm_venv,
cmodel_dir, engine_dir,
num_beams, tp_pp_size):
"Run CodeLlaMa on 4 gpus"
tp_size, pp_size = tp_pp_size
world_size = tp_size * pp_size
if get_device_count() < world_size:
pytest.skip(f"devices are less than {world_size}.")
model_name = 'code_llama'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=code_llama_model_root,
data_type="float16",
gpus=world_size,
tp_size=tp_size,
pp_size=pp_size)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
f"--workers={world_size}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=160",
f"--tokenizer_dir={code_llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
"--input_text='In python, write a function for binary searching an element in an integer array.'",
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
run_cmd)
print("Run inference")
tensorrt_llm_rouge1_threshold = 18 if "70b" in code_llama_model_root else 22
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=code_llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("per_token_channel", [True, False],
ids=["enable_ptpc", "disable_ptpc"])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
@pytest.mark.parametrize("data_type", ["float16", "bfloat16"])
def test_llm_llama_smooth_quant_1gpu_summary(llama_example_root,
llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
per_token_channel, cmodel_dir,
data_type):
"Run smooth quant on single gpu"
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama-smooth",
model_path=llama_model_root,
gpus=1,
smoothquant=0.55,
per_token=per_token_channel,
per_channel=per_token_channel,
calib_dataset=f"{llm_datasets_root}/ccdv/cnn_dailymail",
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
"--remove_input_padding=enable",
f"--gemm_plugin={data_type}",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
rouge1_threshold = 17
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=rouge1_threshold,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("use_weight_only", [True, False],
ids=['enable_weight_only', 'disable_weight_only'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_int8_kv_1gpu_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
use_weight_only,
qcache_dir_without_install_package):
print("Quantizing model...")
qformat = "int8_wo" if use_weight_only else "full_prec"
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat=qformat,
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="int8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=19,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_int8_sq_ootb_1gpu_summary(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams, qcache_dir_without_install_package):
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="int8")
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}", "--gpt_attention_plugin=float16",
"--remove_input_padding=enable", "--gemm_plugin=disable",
f"--max_beam_width={num_beams}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
        # adjusted to 15.2 when using TRT build optimization level 3
        tensorrt_llm_rouge1_threshold=15.2,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_int8sq_2gpu_tp2(data_type, llama_example_root,
llama_model_root,
llama_v2_tokenizer_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
qcache_dir_without_install_package):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
# Quantize HF llama checkpoint into int8_sq format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
pp_size=1,
calib_size=32)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_v2_tokenizer_model_root}",
"--data_type=fp16",
f"--engine_dir={engine_dir}",
"--tensorrt_llm_rouge1_threshold=15",
"--check_accuracy",
f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("weight_only_precision", ["int4", "int8"])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_wo_1gpu_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
engine_dir, num_beams, cmodel_dir,
weight_only_precision):
skip_fp8_pre_ada(use_fp8=True)
llm_venv.get_working_directory()
model_name = os.path.basename(llama_example_root)
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type="float16",
use_weight_only=True,
weight_only_precision=weight_only_precision,
gpus=1,
tp_size=1,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=20.2 if
weight_only_precision == 'int8' else 16,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_int8_kv_awq_1gpu_summary(llama_example_root,
llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv,
engine_dir, num_beams,
qcache_dir_without_install_package):
"Run int8 kv cache on single gpu"
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="int8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--gemm_plugin=float16",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=15,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("data_type", ['float16', 'fp8'],
ids=['base_fp16', 'base_fp8'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
@pytest.mark.parametrize("llm_lora_model_root",
[("luotuo-lora-7b-0.1", "Japanese-Alpaca-LoRA-7b-v0")],
ids=["luotuo_japan"],
indirect=True)
def test_llm_llama_v1_multiple_lora_1gpu(data_type, lora_data_type,
llama_example_root, llama_model_root,
llm_datasets_root, llm_venv,
cmodel_dir, engine_dir,
llm_lora_model_root,
qcache_dir_without_install_package):
"run llama with multi lora on 1gpu"
first_lora, second_lora = llm_lora_model_root.split(",")
print("Build engines...")
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama-lora",
model_path=llama_model_root,
gpus=1,
tp_size=1,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--gemm_plugin=auto",
"--lora_plugin=auto",
"--max_batch_size=128",
"--max_input_len=512",
"--max_seq_len=562",
"--lora_dir",
f"{first_lora}",
f"{second_lora}",
"--max_lora_rank=8",
"--lora_target_modules",
"attn_q",
"attn_k",
"attn_v",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
base_run_cmd = [
f"{llama_example_root}/../../../run.py",
"--input_text",
"美国的首都在哪里? \n答案:",
"美国的首都在哪里? \n答案:",
"美国的首都在哪里? \n答案:",
"アメリカ合衆国の首都はどこですか? \n答え:",
"アメリカ合衆国の首都はどこですか? \n答え:",
"アメリカ合衆国の首都はどこですか? \n答え:",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--lora_task_uids",
"-1",
"0",
"1",
"-1",
"0",
"1",
"--top_p=0.5",
"--top_k=0",
"--random_seed=0",
"--max_output_len=10",
]
for use_py_session in [True, False]:
run_cmd = copy.deepcopy(base_run_cmd)
if use_py_session:
print("Run inference with Python runtime...")
run_cmd.append("--use_py_session")
else:
print("Run inference with C++ runtime...")
# TODO: add step to check result
venv_check_call(llm_venv, run_cmd)
@pytest.mark.timeout(7200)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"qformat",
["fp8", pytest.param("int4_awq", marks=skip_post_blackwell)])
@pytest.mark.parametrize(
"tp_pp_size", [(4, 1), (2, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize("code_llama_model_root",
['CodeLlama-34b-Instruct', 'CodeLlama-70b-hf'],
indirect=True)
def test_llm_llama_code_llama_quantization_4gpus_summary(
llama_example_root, code_llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, engine_dir, num_beams, tp_pp_size,
qcache_dir_without_install_package, qformat):
"Run CodeLlaMa on 4 gpus"
skip_fp8_pre_ada(use_fp8=qformat == "fp8")
tp_size, pp_size = tp_pp_size
world_size = tp_size * pp_size
kv_cache_dtype = "fp8" if qformat == "fp8" else "int8"
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=code_llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat=qformat,
quantize_dir=qcache_dir_without_install_package,
tp_size=tp_size,
pp_size=pp_size,
calib_size=32,
kv_cache_dtype=kv_cache_dtype)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
f"--workers={world_size}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=code_llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=20,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root,
max_ite=100)
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.parametrize("llama_model_root",
['Llama-3-8B-Instruct-Gradient-1048k'],
indirect=True)
@pytest.mark.parametrize("dataset_name", ["SlimPajama-6B", "passkey"])
def test_llm_llama_v3_8b_1048k_long_context_ppl(llama_example_root,
llama_model_root, llm_venv,
engine_dir, cmodel_dir,
llm_datasets_root,
dataset_name):
"Build & run llama-3-8B-1048k on long context ppl."
if dataset_name == "SlimPajama-6B" and get_device_memory() < 50000:
pytest.skip("GPU memory is insufficient.")
model_name = os.path.basename(llama_model_root)
dtype = 'float16'
max_input_len = 16384
max_output_len = 50
if dataset_name == "passkey":
print("Generate evaluation dataset for passkey.")
gen_cmd = [
f"{llama_example_root}/../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey", "--test_level=4"
]
venv_check_call(llm_venv, gen_cmd)
max_input_len = 128 * 1024
print("Converting checkpoint...")
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={1}",
f"--max_input_len={max_input_len}",
f"--max_seq_len={max_output_len+max_input_len}",
f"--gemm_plugin={dtype}",
"--max_num_tokens=4096",
"--use_paged_context_fmha=enable",
]
if dataset_name == "SlimPajama-6B":
build_cmd.append("--gather_context_logits")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if dataset_name == "passkey":
print("Run passkey evaluation...")
summary_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
f"--max_input_length={max_input_len}",
f"--max_tokens_in_paged_kv_cache={int(max_input_len * 1.2)}",
"--task=passkey",
"--stop_idx=20",
"--enable_chunked_context",
]
else:
print("Run context ppl evaluation...")
summary_cmd = generate_summary_cmd(
llama_example_root,
tokenizer_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
dataset_dir=f"{llm_datasets_root}/{dataset_name}",
eval_task="eval_context_ppl",
max_input_len=max_input_len,
batch_size=1,
max_ite=200, # the samples will be filtered by min_input_length
tensorrt_llm_ppl_threshold=7.8,
max_tokens_in_paged_kv_cache=int(max_input_len * 1.2),
enable_chunked_context=True,
min_input_length=10000)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("llama_model_root", [
'Llama-3-8B-Instruct-Gradient-1048k', 'Llama-3-70B-Instruct-Gradient-1048k'
],
indirect=True)
@pytest.mark.timeout(10800 if get_sm_version() < 89 else 3600)
def test_llm_llama_v3_1m_long_context_8gpus(llama_example_root,
llama_model_root, llm_venv,
engine_dir, cmodel_dir,
timeout_manager):
"Build & run llama-3-8B-1048k on long context."
model_name = os.path.basename(llama_model_root)
dtype = 'float16'
tp_size, pp_size = 8, 1
world_size = tp_size * pp_size
max_seq_len = 1048576
max_batch_size = 256
# Generate evaluation dataset with timeout management
print("Generate evaluation dataset for passkey.")
with timeout_manager.timed_operation("gen"):
gen_cmd = [
f"{llama_example_root}/../../../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey",
"--test_level=7",
]
venv_check_call(llm_venv,
gen_cmd,
timeout=timeout_manager.remaining_timeout)
# Convert checkpoint with timeout management
print("Converting checkpoint...")
with timeout_manager.timed_operation("convert"):
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=dtype,
tp_size=tp_size,
pp_size=pp_size,
timeout=timeout_manager.remaining_timeout)
# Build engines with timeout management
print("Building engines...")
with timeout_manager.timed_operation("build"):
build_cmd = [
"trtllm-build", f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}", f"--gemm_plugin={dtype}",
f"--workers={world_size}", f"--max_seq_len={max_seq_len}",
"--max_num_tokens=4096", "--use_paged_context_fmha=enable",
f'--max_batch_size={max_batch_size}'
]
check_call(" ".join(build_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
# Run passkey evaluation with timeout management
print("Run passkey evaluation...")
with timeout_manager.timed_operation("eval"):
eval_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
f"--max_input_length={max_seq_len-10}",
"--max_tokens_in_paged_kv_cache=1100000",
"--task=passkey",
"--stop_idx=10",
"--enable_chunked_context",
"--tensorrt_llm_accuracy_threshold=0.9",
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
eval_cmd,
timeout=timeout_manager.remaining_timeout)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize("test_type", ['build', 'infer'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b', 'llama-v2-70b-hf'],
indirect=True)
def test_llm_llama_2nodes_8gpus(test_type, llama_example_root, llama_model_root,
llm_datasets_root, llm_venv, cmodel_dir):
"""
Run test on cluster.
1. run build test on 1 node to save engine tp*pp > 8.
2. run infer test on 1/2 nodes.
"""
data_type = "float16"
num_beams = 4
tp_size, pp_size = 8, 2
world_size = tp_size * pp_size
model_name = os.path.basename(llama_model_root)
# engine dir will be saved for infer tests
engine_dir = os.path.join(llama_example_root, "engines", model_name,
data_type, f"{world_size}-gpu",
f"tp{tp_size}pp{pp_size}")
if test_type == "build":
print("Convert weight...")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=tp_size,
pp_size=pp_size)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin={data_type}",
f"--max_beam_width={num_beams}",
f"--workers={world_size}",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if test_type == "infer":
        assert exists(engine_dir), f"{engine_dir} does not exist."
print("Run inference...")
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
]
venv_check_call(llm_venv, run_cmd)
print("Run summarize...")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
num_beams=num_beams,
dataset_dir=llm_datasets_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("enable_mha_plugin", [True, False],
ids=["plugin", "ootb"])
@pytest.mark.parametrize("max_gpu_percent", [0.05, 1.0])
@pytest.mark.parametrize("llama_model_root",
['llama-v2-7b-hf', 'llama-v2-70b-hf'],
indirect=True)
def test_llm_llama_v2_1gpu_weight_streaming(llama_example_root,
llama_model_root, llm_datasets_root,
llm_venv, engine_dir,
max_gpu_percent, enable_mha_plugin):
"run llama v2 test with streaming"
if "70b" in llama_model_root and get_host_total_memory() < 480000:
pytest.skip("Host memory is less than 480G.")
print("Convert weights...")
model_name = 'llama2_weight_streaming'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=engine_dir,
model=model_name,
model_path=llama_model_root,
load_by_shard=True,
load_model_on_cpu=True)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gemm_plugin=disable",
"--max_batch_size=2",
"--max_beam_width=2",
"--weight_streaming",
]
if enable_mha_plugin:
build_cmd += ["--gpt_attention_plugin=float16"]
else:
build_cmd += [
"--gpt_attention_plugin=disable", "--remove_input_padding=disable",
"--paged_kv_cache=disable"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
for gpu_weights_percent in [0, 0.05, 0.1, 0.2, 0.5, 0.9, 1]:
if gpu_weights_percent > max_gpu_percent:
break
print(f"Run inference with gpu_weights_percent={gpu_weights_percent}")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type", "fp16",
"--check_accuracy", f"--engine_dir={engine_dir}", "--num_beams=2",
f"--dataset_dir={llm_datasets_root}",
f"--gpu_weights_percent={gpu_weights_percent}", "--max_ite=1",
"--log_level=verbose"
]
if not enable_mha_plugin:
summary_cmd += ["--use_py_session"] # only py session support
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("deepseek_model_root",
['deepseek-coder-6.7b-instruct'],
indirect=True)
@pytest.mark.parametrize("test_case", ["ailab"], indirect=True)
def test_llm_llama_1gpu_streaming_llm(llama_example_root, deepseek_model_root,
llm_venv, cmodel_dir, engine_dir,
test_case):
"Run deep seek with StreamingLLM, RCCA https://nvbugs/4666604"
model_name = 'deepseek'
max_input_len = test_case['max_input_len']
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=deepseek_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=auto",
"--gemm_plugin=auto",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--streamingllm=enable",
f"--max_input_len={max_input_len}",
f"--max_seq_len={max_input_len}",
"--max_batch_size=256",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
run_cmd = [
f"{llama_example_root}/../../../run.py",
f"--tokenizer_dir={deepseek_model_root}",
f"--engine_dir={engine_dir}",
f"--max_input_length={max_input_len}",
f"--input_file={test_case['input_file']}",
"--max_output_len=50",
"--max_attention_window_size=2048",
"--sink_token_length=4",
]
output = venv_check_output(llm_venv, run_cmd)
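    # The expected answer is the Chinese name of the Shanghai AI Laboratory.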
assert "上海人工智能实验室" in output, output
@pytest.mark.parametrize("fp8_quant", [
'disable_fp8',
pytest.param('enable_fp8', marks=skip_post_blackwell),
pytest.param('enable_fp8_meta_recipe', marks=skip_post_blackwell)
])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b', 'llama-3.2-1b'],
indirect=True)
def test_llm_llama_v3_1_1node_single_gpu(llama_example_root, llama_model_root,
llm_venv, cmodel_dir,
llm_datasets_root, llm_rouge_root,
engine_dir, fp8_quant):
"Run llama3.1 test on 1 gpu."
data_type = "bfloat16"
model_name = os.path.basename(llama_model_root)
use_fp8_rowwise = False
use_meta_fp8_rowwise_recipe = False
if fp8_quant == 'enable_fp8':
use_fp8_rowwise = True
elif fp8_quant == 'enable_fp8_meta_recipe':
use_fp8_rowwise = True
use_meta_fp8_rowwise_recipe = True
print("Convert weight...")
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=1,
pp_size=1,
use_fp8_rowwise=use_fp8_rowwise,
use_meta_fp8_rowwise_recipe=use_meta_fp8_rowwise_recipe)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--max_batch_size={8}",
f"--max_seq_len={2048}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
f"--hf_model_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--tensorrt_llm_rouge1_threshold={14}",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-3.2-1b'], indirect=True)
def test_llm_llama_v3_2_smoothquant_1node_single_gpu(
llama_example_root, llama_model_root, llm_venv, cmodel_dir,
llm_datasets_root, llm_rouge_root, engine_dir):
"Run llama3.2-1b smooth quant test on 1 gpu."
data_type = "bfloat16"
model_name = os.path.basename(llama_model_root)
print("Convert weight...")
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=1,
smoothquant=0.5,
per_token=True,
per_channel=True,
calib_dataset=f"{llm_datasets_root}/ccdv/cnn_dailymail",
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--max_batch_size={1}",
f"--max_seq_len={1024}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
f"--hf_model_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--tensorrt_llm_rouge1_threshold={18.8}",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.timeout(7200)
@pytest.mark.skip_device_not_contain(["A100", "H100"])
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@skip_post_blackwell_ultra
@pytest.mark.parametrize("fp8_quant",
[pytest.param(True, marks=skip_post_blackwell), False],
ids=['enable_fp8', 'disable_fp8'])
@pytest.mark.parametrize("llama_model_root", [
'llama-3.1-8b', 'llama-3.1-70b', 'llama-3.1-405b',
pytest.param('llama-3.1-405b-fp8', marks=skip_post_blackwell)
],
indirect=True)
@pytest.mark.parametrize(
"gemm_allreduce", [pytest.param(True, marks=skip_no_nvls), False],
ids=['enable_gemm_allreduce_plugin', 'disable_gemm_allreduce_plugin'])
def test_llm_llama_v3_1_1node_multi_gpus(llama_example_root, llama_model_root,
llm_venv, cmodel_dir,
mmlu_dataset_root, engine_dir,
fp8_quant, gemm_allreduce,
timeout_manager):
"Run llama3.1 test on 1 node."
if ("8B" not in llama_model_root) and (get_host_total_memory() < 1000000):
pytest.skip("Host memory is insufficient.")
if "fp8" in llama_model_root.lower():
skip_fp8_pre_ada(use_fp8=True)
skip_fp8_pre_ada(use_fp8=fp8_quant)
data_type = "bfloat16"
world_size = tp_size = get_device_count()
pp_size = 1
model_name = os.path.basename(llama_model_root)
if not fp8_quant and "Meta-Llama-3.1-405B" == model_name:
pytest.skip("Build engine will be OOM on 1 node.")
# Convert weights with timeout management
print("Convert weight...")
with timeout_manager.timed_operation("convert"):
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=tp_size,
pp_size=pp_size,
use_fp8_rowwise=fp8_quant,
load_by_shard=True,
workers=world_size,
timeout=timeout_manager.remaining_timeout)
# Build engines with timeout management
print("Build engines...")
with timeout_manager.timed_operation("build"):
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--workers={world_size}",
f"--max_batch_size={256}",
"--use_paged_context_fmha=enable",
"--max_num_tokens=4096",
"--max_input_len=64000",
"--max_seq_len=65000",
]
if gemm_allreduce:
build_cmd += [f"--gemm_allreduce_plugin={data_type}"]
check_call(" ".join(build_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
# Generate dataset with timeout management
with timeout_manager.timed_operation("gen"):
gen_cmd = [
f"{llama_example_root}/../../../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey",
"--test_level=3",
]
venv_check_call(llm_venv,
gen_cmd,
timeout=timeout_manager.remaining_timeout)
# Run evaluation with timeout management
print("Run eval...")
with timeout_manager.timed_operation("eval"):
eval_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
"--task=passkey",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
"--stop_idx=6",
"--max_input_length=64000",
"--enable_chunked_context",
"--kv_cache_free_gpu_memory_fraction=0.999",
"--max_tokens_in_paged_kv_cache=65064",
"--output_dir=64k_context_tp8",
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
eval_cmd,
timeout=timeout_manager.remaining_timeout)
# Run MMLU with timeout management
print("Run mmlu...")
with timeout_manager.timed_operation("mmlu"):
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy"
]
check_call(" ".join(mmlu_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("test_type", ['build', 'infer'])
@pytest.mark.parametrize(
"tp_pp_size", [(16, 1), (8, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize(
"fp8_quant",
['disable_fp8',
pytest.param('enable_fp8', marks=skip_post_blackwell)])
@pytest.mark.parametrize("llama_model_root", [
'llama-3.1-8b', 'llama-3.1-70b', 'llama-3.1-405b',
pytest.param('llama-3.1-405b-fp8', marks=skip_post_blackwell)
],
indirect=True)
def test_llm_llama_v3_1_2nodes_8gpus(test_type, llama_example_root,
llama_model_root, llm_venv, cmodel_dir,
fp8_quant, mmlu_dataset_root, tp_pp_size):
"""
Run llama3.1 test on cluster.
1. run build test on 1 node to save engine tp*pp > 8.
2. run infer test on 1/2 nodes.
"""
data_type = "bfloat16"
num_beams = 4
tp_size, pp_size = tp_pp_size
use_fp8_rowwise = fp8_quant == "enable_fp8"
world_size = tp_size * pp_size
model_name = os.path.basename(llama_model_root)
workspace = llm_venv.get_working_directory()
# engine dir will be saved for infer tests
engine_dir = os.path.join(llama_example_root, "engines", model_name,
data_type, f"{world_size}-gpu",
f"tp{tp_size}pp{pp_size}", fp8_quant)
context_dir = os.path.join(engine_dir, "128k_context")
if test_type == "build":
print("Convert weight...")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=tp_size,
pp_size=pp_size,
use_fp8_rowwise=use_fp8_rowwise,
load_by_shard=True,
workers=tp_size)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_allreduce_plugin={data_type}",
f"--max_beam_width={num_beams}",
f"--workers={tp_size}",
f"--max_batch_size={4}",
"--use_paged_context_fmha=enable",
"--max_num_tokens=4096",
"--max_input_len=255000",
"--max_seq_len=256000",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
check_call(f"mkdir -p {context_dir}", shell=True)
gen_cmd = [
f"{llama_example_root}/../../../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey",
"--test_level=4",
]
venv_check_call(llm_venv, gen_cmd)
dest = shutil.copy(f"{workspace}/passkey.jsonl", context_dir)
print(dest)
if test_type == "infer":
        assert exists(engine_dir), f"{engine_dir} does not exist."
print("Run eval...")
eval_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
"--task=passkey",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
"--stop_idx=6",
"--max_input_length=255000",
"--enable_chunked_context",
"--kv_cache_free_gpu_memory_fraction=0.999",
"--max_tokens_in_paged_kv_cache=256064",
f"--data_dir={context_dir}",
f"--output_dir={context_dir}_tp8pp2",
]
venv_check_call(llm_venv, eval_cmd)
print("Run mmlu...")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("low_latency_gemm_plugin", ["fp8"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_low_latency_gemm(llama_example_root,
llama_model_root, llm_datasets_root,
llm_venv, engine_dir,
qcache_dir_without_install_package,
low_latency_gemm_plugin):
"run llama v2 test with low latency gemm plugin"
if low_latency_gemm_plugin == "fp8":
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
pytest.skip(f"low_latency_gemm_plugin only supports fp8 now")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
f"--low_latency_gemm_plugin={low_latency_gemm_plugin}",
"--remove_input_padding=enable",
"--max_batch_size=1",
"--max_input_len=2048",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run Summarization test")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type", "fp16",
f"--engine_dir={engine_dir}", "--check_accuracy", "--max_ite=40",
f"--dataset_dir={llm_datasets_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("qformat",
['int8_sq', 'int8_wo', 'int4_awq', 'int4_wo'])
@skip_post_blackwell # Weight-only and SmoothQuant not supported on Blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_v3_1_quantization_1gpu_manage_weights(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, qcache_dir_without_install_package, qformat):
"run llama v3.1 with managed weights and different quantizations on 1gpu"
data_type = "float16"
tp_size, pp_size = 1, 1
world_size = tp_size * pp_size
num_beams = 1
print("Quantizing engine...")
# Quantize HF llama checkpoint
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat=qformat,
quantize_dir=qcache_dir_without_install_package,
tp_size=tp_size,
pp_size=pp_size,
calib_size=32,
seed=0)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
f"--moe_plugin={data_type}",
f"--max_beam_width={num_beams}",
"--context_fmha=enable",
f"--workers={world_size}",
f"--max_batch_size={16}",
f"--max_input_len={2047}",
f"--max_seq_len={2048}",
f"--max_num_tokens={16384}",
"--fast_build",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
    rouge1_threshold_map = {
'int4_wo': 14.5,
'int8_wo': 17.0,
'int4_awq': 16.0,
'int8_sq': 12.35,
}
    tensorrt_llm_rouge1_threshold = rouge1_threshold_map[qformat]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_4gpu_tp2cp2(data_type, llama_example_root,
llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir,
engine_dir, num_beams):
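    "Run llama v2 with tp2 x cp2 on 4 gpus"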
model_name = os.path.basename(llama_model_root)
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=2,
pp_size=1,
cp_size=2,
)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin={data_type}",
f"--max_beam_width={num_beams}",
f"--workers=4",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 17,
}[num_beams]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "4", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_fp8_2gpu_cp2(data_type, llama_example_root,
llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir,
engine_dir, num_beams):
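    "Run llama v2 fp8 with cp2 on 2 gpus"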
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=cmodel_dir,
cp_size=2,
calib_size=32,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin=fp8",
f"--use_paged_context_fmha disable",
f"--use_fp8_context_fmha enable",
f"--max_beam_width={num_beams}",
f"--workers=2",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = 12.0
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b', 'llama-3.2-1b'],
indirect=True)
def test_llm_llama_lookahead_xqa_fp8_1gpu(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
"""
Run Llama with lookahead and XQA
RCCA: https://nvbugs/4924719
"""
data_type = "bfloat16"
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
"--max_batch_size=32",
"--max_seq_len=131072",
"--max_num_tokens=8192",
"--use_fused_mlp=enable",
"--use_paged_context_fmha=enable",
"--multiple_profiles=enable",
"--reduce_fusion=disable",
"--speculative_decoding_mode=lookahead_decoding",
"--max_draft_len=83",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--lookahead=[7,7,7]",
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
# The output should not include special characters.
pattern = re.compile(r'[^a-zA-Z0-9\s\'\"]{4,}')
assert not bool(pattern.search(output[0])), output[0]
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
lookahead="[7,7,7]",
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("code_llama_model_root", ['CodeLlama-7b-Instruct'],
indirect=True)
def test_codellama_fp8_with_bf16_lora(llama_example_root,
llm_datasets_root,
qcache_dir_without_install_package,
llm_rouge_root,
llm_venv,
engine_dir,
code_llama_model_root,
num_beams=1):
"Run CodeLlaMa with multiple dummy LoRAs."
print("Quantizing model to fp8...")
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=code_llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="fp8")
test_multi_lora_support(
hf_model_dir=code_llama_model_root,
tllm_ckpt_dir=qmodel_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=llama_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
use_code_prompts=True,
)
@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("llama_model_root", [
'llama-v2-7b-hf', 'llama-v3-8b-instruct-hf', 'llama-3.1-8b', 'llama-3.2-1b',
'llama-3.2-3b'
],
indirect=True)
def test_llama_3_x_fp8_with_bf16_lora(llama_example_root, llm_datasets_root,
qcache_dir_without_install_package,
llm_venv, engine_dir, llama_model_root):
"Run Llama 3.1 and 3.2 models with multiple dummy LoRAs."
print("Quantizing model to fp8...")
defs.ci_profiler.start("quantize_model")
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="fp8")
defs.ci_profiler.stop("quantize_model")
print(
f"quantize_model: {defs.ci_profiler.elapsed_time_in_sec('quantize_model')} sec"
)
defs.ci_profiler.start("test_multi_lora_support")
test_multi_lora_support(
hf_model_dir=llama_model_root,
tllm_ckpt_dir=qmodel_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=llama_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
)
defs.ci_profiler.stop("test_multi_lora_support")
print(
f"test_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_multi_lora_support')} sec"
)
@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("llama_model_root", [
'llama-v3-8b-instruct-hf',
'llama-3.1-8b-instruct',
'llama-3.2-1b-instruct',
'llama-3.2-3b-instruct',
'llama-3.3-70b-instruct',
],
indirect=True)
def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
qcache_dir_without_install_package,
llm_venv, engine_dir, llama_model_root):
"""Run Llama models with multiple dummy LoRAs using LLM-API Torch backend."""
if "llama-3.3-70b-instruct" in llama_model_root.lower():
tensor_parallel_size = 8
if get_device_count() < 8:
pytest.skip(
"Skipping: llama-3.3-70b-instruct model requires 8 GPUs")
else:
tensor_parallel_size = 1
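    # Per-model reference continuations, passed to the multi-LoRA helper below
    # as the expected outputs.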
expected_outputs = {
'llama-v3-8b-instruct-hf': [
" I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
" Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
" No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
" I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
'llama-3.1-8b-instruct': [
" I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
" Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
" | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
" I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
" Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
],
'llama-3.2-1b-instruct': [
" I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
" Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
" Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
" based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
" Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
],
'llama-3.2-3b-instruct': [
" I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
" (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
" and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
" and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
" Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
],
'llama-3.3-70b-instruct': [
" I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
" Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
" No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
" I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
}
print("Testing with LLM-API Torch backend...")
defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llama_model_root).lower()
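    # Generate through the PyTorch LLM-API backend with two zero-weight LoRA
    # adapters and compare against the per-model references above.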
test_llm_torch_multi_lora_support(
hf_model_dir=llama_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
tensor_parallel_size=tensor_parallel_size,
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
    )


@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("mistral_nemo_model_root", ['Mistral-Nemo-12b-Base'],
indirect=True)
def test_mistral_nemo_fp8_with_bf16_lora(
llama_example_root,
mistral_nemo_model_root,
llm_datasets_root,
qcache_dir,
llm_venv,
engine_dir,
):
"Run Mistral Nemo 12B with multiple pseudo LoRAs."
# Quantize the base model to fp8.
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=mistral_nemo_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir,
calib_size=32,
kv_cache_dtype="fp8")
test_multi_lora_support(
hf_model_dir=mistral_nemo_model_root,
tllm_ckpt_dir=qmodel_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=llama_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
    )


@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_lookahead_single_gpu_summary(llama_example_root,
llama_model_root, llm_venv,
engine_dir, cmodel_dir,
llm_rouge_root,
llm_datasets_root):
"Run llama test with lookahead"
print("Convert weight...")
data_type = "bfloat16"
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama3",
model_path=llama_model_root,
gpus=1,
tp_size=1,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--max_batch_size=8",
"--max_input_len=4096",
"--max_seq_len=8192",
"--max_draft_len=83",
"--speculative_decoding_mode=lookahead_decoding",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Summary")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=15,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root,
lookahead_config='[7, 7, 7]')
    venv_check_call(llm_venv, summary_cmd)


@skip_post_blackwell
@pytest.mark.parametrize("model_name,model_path", [
("Llama-3.1-8B-Instruct", "llama-3.1-model/Llama-3.1-8B-Instruct"),
])
def test_llm_api_lookahead_decoding_1gpu(model_name, model_path):
"""
RCCA: https://nvbugs/5359218
"""
from defs.conftest import llm_models_root
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
LookaheadDecodingConfig, SamplingParams)
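    # The engine is sized for the ~13k-token prompt below; chunked prefill is
    # enabled later, which generally requires paged context FMHA.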
build_config = BuildConfig(max_batch_size=128,
max_input_len=2048,
max_seq_len=32768,
max_num_tokens=8192,
max_draft_len=111)
build_config.plugin_config.use_paged_context_fmha = True
build_config.plugin_config.multiple_profiles = True
lookahead_config = LookaheadDecodingConfig(max_window_size=8,
max_ngram_size=3,
max_verification_set_size=3)
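    # The same lookahead config is used both when constructing the LLM and
    # per-request via SamplingParams below.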
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
llm = LLM(model=f"{llm_models_root()}/{model_path}",
kv_cache_config=kv_cache_config,
build_config=build_config,
speculative_config=lookahead_config,
enable_chunked_prefill=True)
prompt = """Write a C++ program to find the nth Fibonacci number using
recursion. Now we define a sequence of numbers in which each number is the
sum of the three preceding ones. The first three numbers are 0, -1, -1.
Write a program to find the nth number.""" * 200 # around 13k tokens
sampling_params = SamplingParams(lookahead_config=lookahead_config)
output = llm.generate(prompt, sampling_params=sampling_params)
assert output is not None, "No output generated from LLM"