TensorRT-LLMs/tests/integration/defs/examples/test_gpt.py
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module test_gpt test gpt examples."""
import csv
import os
import re
from pathlib import Path
import pytest
from defs.common import (convert_weights, generate_summary_cmd, parse_mpi_cmd,
parse_output, quantize_data, run_and_check, similar,
similarity_score, test_multi_lora_support,
venv_check_call, venv_check_output,
venv_mpi_check_call, venv_mpi_check_output)
from defs.conftest import (get_device_memory, get_sm_version, skip_fp8_pre_ada,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
INPUT_TEXT_1 = "After Washington had returned to Williamsburg, " + \
"Dinwiddie ordered him to lead a larger force to assist Trent in his work. " + \
"While en route, Washington learned of Trent's retreat. " + \
"Since Tanaghrisson had promised support to the British, " + \
"Washington continued toward Fort Duquesne and met with the Mingo leader. " + \
"Learning of a French scouting party in the area, Washington, " + \
"with Tanaghrisson and his party, surprised the Canadians on May 28 " + \
"in what became known as the Battle of Jumonville Glen. " + \
"They killed many of the Canadians, including their commanding officer, " + \
"Joseph Coulon de Jumonville, whose head was reportedly split open by " + \
"Tanaghrisson with a tomahawk. The historian Fred Anderson suggests that " + \
"Tanaghrisson was acting to gain the support of the British and regain " + \
"authority over his own people. They had been inclined to support the French, " + \
"with whom they had long trading relationships. One of Tanaghrisson's men told " + \
"Contrecoeur that Jumonville had been killed by British musket fire. " + \
"Question: Upon learning of a French scounting party in the area, " + \
"what did Washington do? Answer:"
INPUT_TEXT_2 = "You hold the job title in the Wizarding World of Harry Potter where you " + \
"say random words looking for spells"
@pytest.mark.parametrize("num_beams", [1, 4],
ids=["num_beams_1", "num_beams_4"])
@pytest.mark.parametrize(
"return_all_generated_tokens", [True, False],
ids=["return_all_generated_tokens", "disable_return_all_generated_tokens"])
@pytest.mark.parametrize("batch_size", [1, 3],
ids=["batch_size_1", "batch_size_3"])
def test_streaming_beam(gpt_example_root, llm_venv, llm_gpt2_model_root,
engine_dir, cmodel_dir, num_beams,
return_all_generated_tokens, batch_size):
""" Test the correctness of beam search + streaming versus the outputs of
non-streaming beam search. Both use the cpp runtime.
The num_beams=1 test acts as a test for `return_all_generated_tokens`"""
dtype = 'float16'
output_len = 10
texts = ["want to", "Movies are just", "Soyer was"]
input_text = texts[:batch_size]
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2",
model_path=llm_gpt2_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
f"--gemm_plugin={dtype}",
f"--max_beam_width={num_beams}",
"--context_fmha=enable",
"--use_paged_context_fmha=enable",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
streaming_command = [
f"{gpt_example_root}/../../../run.py", "--no_add_special_tokens",
f"--max_output_len={output_len}", f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llm_gpt2_model_root}", f"--streaming",
f"--streaming_interval=1", f"--num_beams={num_beams}", f"--input_text",
*input_text
]
if return_all_generated_tokens:
streaming_command += ["--return_all_generated_tokens"]
streaming_outputs = venv_check_output(llm_venv, streaming_command)
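    # Reproduce the streamed partial results by running non-streaming generation once per
    # output length from 1 to output_len and concatenating the outputs for comparison.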
joined_nonstreamed_outputs = ""
for length_iterator in range(1, output_len + 1):
command = [
f"{gpt_example_root}/../../../run.py", "--no_add_special_tokens",
f"--max_output_len={length_iterator}", f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llm_gpt2_model_root}",
f"--num_beams={num_beams}", f"--input_text", *input_text
]
if return_all_generated_tokens:
command += ["--return_all_generated_tokens"]
non_streaming_output = venv_check_output(llm_venv, command)
joined_nonstreamed_outputs += "Output from command" + str(
command) + "\n" + non_streaming_output
def parse_output(text: str) -> list[str]:
results = []
while True:
match = re.search(
r"Output \[Text \d+ Beam \d+\]: \"([^\"]*)\"\r?\n", text)
if match is None:
break
_, end = match.span()
results.append(match.group(1))
text = text[end:]
return results
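    # parse_output collects every generated string from lines of the form
    # Output [Text <i> Beam <j>]: "<generated text>" printed by run.py.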
print("STREAMING OUTPUT HERE\n\n\n",
streaming_outputs,
"\n\n\n",
sep="----")
print("NON-STREAMING OUTPUT HERE\n\n\n",
joined_nonstreamed_outputs,
"\n\n\n",
sep="----")
parsed_streamed_outputs = parse_output(streaming_outputs)
parsed_nonstreamed_outputs = parse_output(joined_nonstreamed_outputs)
def ordered_subset(s1, s2):
"""
        Check whether the streamed outputs are an ordered subset of the non-streamed outputs.
        Streaming can sometimes skip intermediate outputs.
"""
s2 = iter(s2)
try:
for c in s1:
while next(s2) != c:
pass
else:
return True
except StopIteration:
return False
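    # For example, ordered_subset(["a", "c"], ["a", "b", "c"]) is True, while
    # ordered_subset(["c", "a"], ["a", "b", "c"]) is False: order must be preserved.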
streaming_is_subset = ordered_subset(parsed_streamed_outputs,
parsed_nonstreamed_outputs)
print("streaming_is_subset ", streaming_is_subset)
assert streaming_is_subset
is_equal = (parsed_streamed_outputs == parsed_nonstreamed_outputs)
print("is_equal", is_equal)
if not is_equal:
print("Differences:")
for streamed, nonstreamed in zip(parsed_streamed_outputs,
parsed_nonstreamed_outputs):
if (streamed != nonstreamed):
print("Streamed:", streamed)
print("Nonstreamed:", nonstreamed)
    # Streaming can skip outputs if the next set of outputs arrives.
    # This means the is_equal flag is currently flaky: https://nvbugspro.nvidia.com/bug/4851644
# assert is_equal
def test_llm_gpt2_kv_cache_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
engine_dir, cmodel_dir):
"gpt2 cases on 1 gpu"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2",
model_path=llm_gpt2_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
f"--gemm_plugin={dtype}",
"--context_fmha=enable",
"--use_paged_context_fmha=enable",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../summarize.py",
f"--engine_dir={engine_dir}",
"--test_hf",
"--batch_size=1",
"--test_trt_llm",
f"--hf_model_dir={llm_gpt2_model_root}",
"--check_accuracy",
"--tensorrt_llm_rouge1_threshold=13.5",
"--no_add_special_tokens",
"--max_tokens_in_paged_kv_cache=1024",
])
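    # Second run: enable KV cache block reuse and give the cache half of the free GPU memory.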
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../summarize.py",
f"--engine_dir={engine_dir}",
"--test_hf",
"--batch_size=1",
"--test_trt_llm",
f"--hf_model_dir={llm_gpt2_model_root}",
"--check_accuracy",
"--tensorrt_llm_rouge1_threshold=13.5",
"--no_add_special_tokens",
"--kv_cache_enable_block_reuse",
"--kv_cache_free_gpu_memory_fraction=0.5",
])
@pytest.mark.parametrize(
"use_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
def test_llm_gpt2_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
llm_datasets_root, llm_rouge_root, engine_dir,
cmodel_dir, use_attention_plugin, use_gemm_plugin):
"gpt2 cases on 1 gpu"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2",
model_path=llm_gpt2_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
]
if use_attention_plugin:
build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
else:
build_cmd.extend([
"--gpt_attention_plugin=disable",
"--context_fmha=disable",
"--paged_kv_cache=disable",
"--remove_input_padding=disable",
])
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../summarize.py",
f"--engine_dir={engine_dir}", "--test_hf", "--batch_size=1",
"--test_trt_llm", f"--hf_model_dir={llm_gpt2_model_root}",
"--check_accuracy", "--tensorrt_llm_rouge1_threshold=13.5",
"--no_add_special_tokens", f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
])
if not use_gemm_plugin:
print("Checking embedding sharing...")
# Embedding sharing should be enabled automatically.
# Gpt2 has 124M parameters among which 36.8M are shared between embedding and lm_head.
# If embedding sharing is enabled, the FP16 engine size should be about 248 MB;
# otherwise, the engine size should be about 321.6 MB.
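        # (36.8M shared FP16 parameters ~= 73.6 MB, which matches the 321.6 MB - 248 MB gap.)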
engine_size = os.path.getsize(f"{engine_dir}/rank0.engine") / (1024**2)
assert engine_size < 280
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("use_py_session", [False, True],
ids=["use_cpp_session", "use_py_session"])
@pytest.mark.parametrize("streaming", [False, True],
ids=["non_streaming", "streaming"])
def test_llm_gpt2_medium_1gpu(gpt_example_root, llm_venv,
llm_gpt2_medium_model_root, cmodel_dir,
engine_dir, use_gemm_plugin, use_py_session,
streaming):
"gpt2-medium build & run"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-medium",
model_path=llm_gpt2_medium_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llm_gpt2_medium_model_root}",
"--no_add_special_tokens"
]
if streaming:
run_cmd.append("--streaming")
if use_py_session:
run_cmd.append("--use_py_session")
print("Running inference...")
output = venv_check_output(llm_venv, run_cmd)
valid_outputs = [
"chef before moving to London in the early",
"chef before moving to London in the late",
"chef and eventually became a chef at a",
]
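    # Non-streaming: the single final output must match one of the references above.
    # Streaming: each partial output should be at least as close to a reference as the
    # previous one, and the final output must match.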
if not streaming:
output = parse_output(output)[0]
assert any([similar(output, expect)
for expect in valid_outputs]), f"output is: {output}"
else:
# Fetch all outputs and expect a monotonically increasing similarity
similarities = []
for suboutput in parse_output(output):
similarities.append(
max([
similarity_score(suboutput, expect)
for expect in valid_outputs
]))
assert (
all(x <= y for x, y in zip(similarities, similarities[1:]))
), f"streaming outputs must have a monotonically increasing similarity score. similarities: {similarities}"
output = parse_output(output)[-1]
assert any([similar(output, expect)
for expect in valid_outputs]), f"output is: {output}"
@pytest.mark.parametrize("use_py_session", [False, True],
ids=["use_cpp_session", "use_py_session"])
@pytest.mark.parametrize("streaming", [False, True],
ids=["non_streaming", "streaming"])
def test_llm_gpt2_medium_bad_words_1gpu(gpt_example_root, llm_venv,
llm_gpt2_medium_model_root, cmodel_dir,
engine_dir, use_py_session, streaming):
"gpt2 build & run"
if use_py_session and streaming:
pytest.skip(
"Streaming with py session does not return complete sequence to reliably check stop words"
)
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-medium",
model_path=llm_gpt2_medium_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llm_gpt2_medium_model_root}",
"--no_add_special_tokens"
]
if streaming:
run_cmd.append("--streaming")
if use_py_session:
run_cmd.append("--use_py_session")
valid_outputs = [
"chef before moving to the UK in the",
"chef and eventually became a chef at a",
]
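    # Ban " London" on its own, then the phrase " to London" together with an irrelevant entry,
    # and once more with the two entries reordered; the expected outputs are the same in all cases.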
bad_words_args = ["--bad_words", " London"]
run_and_check(llm_venv,
run_cmd + bad_words_args,
valid_outputs,
streaming=streaming)
bad_words_args = ["--bad_words", " to London", " irrelevant words"]
run_and_check(llm_venv,
run_cmd + bad_words_args,
valid_outputs,
streaming=streaming)
bad_words_args = ["--bad_words", " irrelevant words", " to London"]
run_and_check(llm_venv,
run_cmd + bad_words_args,
valid_outputs,
streaming=streaming)
@pytest.mark.parametrize("use_py_session", [False, True],
ids=["use_cpp_session", "use_py_session"])
@pytest.mark.parametrize("streaming", [False, True],
ids=["non_streaming", "streaming"])
def test_llm_gpt2_medium_stop_words_1gpu(gpt_example_root, llm_venv,
llm_gpt2_medium_model_root, cmodel_dir,
engine_dir, use_py_session, streaming):
"gpt2 build & run"
if use_py_session and streaming:
pytest.skip(
"Streaming with py session does not return complete sequence to reliably check stop words"
)
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-medium",
model_path=llm_gpt2_medium_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llm_gpt2_medium_model_root}",
"--no_add_special_tokens"
]
if streaming:
run_cmd.append("--streaming")
if use_py_session:
run_cmd.append("--use_py_session")
valid_outputs = [
"chef before moving to London",
"chef and eventually became",
]
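    # Stop on single words first, then on multi-token phrases mixed with an irrelevant entry,
    # and once more with the entries reordered.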
stop_words_args = ["--stop_words", " London", " became"]
run_and_check(llm_venv,
run_cmd + stop_words_args,
valid_outputs,
streaming=streaming)
stop_words_args = [
"--stop_words", " eventually became", " to London", " irrelevant output"
]
run_and_check(llm_venv,
run_cmd + stop_words_args,
valid_outputs,
streaming=streaming)
stop_words_args = [
"--stop_words", " to London", " eventually became", " irrelevant output"
]
run_and_check(llm_venv,
run_cmd + stop_words_args,
valid_outputs,
streaming=streaming)
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize(
"use_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
def test_llm_gpt3_175b_2layers_1node_8gpus(gpt_example_root, llm_venv,
engine_dir, use_attention_plugin,
use_gemm_plugin):
"Build & run GPT-3 175B: 2 layer w/ plugins, regression test for issues #20"
dtype = 'float16'
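    # generate_checkpoint_config.py only emits a model config (no weight conversion), which is
    # enough to exercise the engine build and a smoke-test run for this 2-layer configuration.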
convert_cmd = [
f"{gpt_example_root}/../../../generate_checkpoint_config.py",
f"--output_path={engine_dir}/ckpt_config.json",
"--architecture=GPTForCausalLM", f"--dtype={dtype}",
"--num_hidden_layers=2", "--num_attention_heads=96",
"--hidden_size=12288", "--vocab_size=51200", "--tp_size=8"
]
venv_check_call(llm_venv, convert_cmd)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--model_config={engine_dir}/ckpt_config.json",
f"--output_dir={engine_dir}",
f"--max_batch_size={256}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
]
if use_attention_plugin:
build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
else:
build_cmd.extend([
"--gpt_attention_plugin=disable",
"--context_fmha=disable",
"--paged_kv_cache=disable",
"--remove_input_padding=disable",
])
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
venv_mpi_check_call(
llm_venv,
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "8"], [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}", "--no_add_special_tokens"
])
@pytest.mark.parametrize(
"use_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
def test_llm_gpt3_175b_96layers_build_only(gpt_example_root, llm_venv,
engine_dir, use_attention_plugin,
use_gemm_plugin):
"Build GPT-3 175B: 96 layer w/ plugins"
dtype = 'float16'
convert_cmd = [
f"{gpt_example_root}/../../../generate_checkpoint_config.py",
f"--output_path={engine_dir}/ckpt_config.json",
"--architecture=GPTForCausalLM", f"--dtype={dtype}",
"--num_hidden_layers=96", "--num_attention_heads=96",
"--hidden_size=12288", "--vocab_size=51200", "--tp_size=8"
]
venv_check_call(llm_venv, convert_cmd)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--model_config={engine_dir}/ckpt_config.json",
f"--output_dir={engine_dir}",
f"--max_batch_size={64}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
]
if use_attention_plugin:
build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
else:
build_cmd.extend([
"--gpt_attention_plugin=disable",
"--context_fmha=disable",
"--paged_kv_cache=disable",
"--remove_input_padding=disable",
])
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
"use_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("context_fmha", [True, False],
ids=["enable_fmha", "disable_fmha"])
@pytest.mark.parametrize("parallel_build", [True, False],
ids=["parallel_build", "serial_build"])
def test_llm_gpt3_175b_1node_8gpus(gpt_example_root, llm_venv, engine_dir,
use_attention_plugin, use_gemm_plugin,
context_fmha, parallel_build,
timeout_manager):
"Build & Run GPT-3 175B: 96 layer w/ plugins"
dtype = 'float16'
# Convert checkpoint with timeout management
with timeout_manager.timed_operation("convert"):
convert_cmd = [
f"{gpt_example_root}/../../../generate_checkpoint_config.py",
f"--output_path={engine_dir}/ckpt_config.json",
"--architecture=GPTForCausalLM", f"--dtype={dtype}",
"--num_hidden_layers=96", "--num_attention_heads=96",
"--hidden_size=12288", "--vocab_size=51200", "--tp_size=8"
]
venv_check_call(llm_venv,
convert_cmd,
timeout=timeout_manager.remaining_timeout)
# Build engines with timeout management
print("Building engines...")
with timeout_manager.timed_operation("build"):
build_cmd = [
"trtllm-build",
f"--model_config={engine_dir}/ckpt_config.json",
f"--output_dir={engine_dir}",
f"--max_batch_size={32}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
]
if use_attention_plugin:
build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
if context_fmha:
build_cmd.extend(["--context_fmha=enable"])
else:
build_cmd.extend(["--context_fmha=disable"])
else:
build_cmd.extend([
"--gpt_attention_plugin=disable",
"--context_fmha=disable",
"--paged_kv_cache=disable",
"--remove_input_padding=disable",
])
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
if parallel_build:
build_cmd.extend(["--workers=8"])
check_call(" ".join(build_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
# Run inference with timeout management
print('Run gpt3-175b...')
with timeout_manager.timed_operation("run"):
venv_mpi_check_call(
llm_venv,
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "8"], [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}", "--no_add_special_tokens"
],
timeout=timeout_manager.remaining_timeout)
@skip_post_blackwell
@pytest.mark.parametrize("per_token_channel", [True, False],
ids=["enable_ptpc", "disable_ptpc"])
def test_llm_gpt2_smooth_single_gpu_summary(gpt_example_root, llm_venv,
llm_gpt2_model_root,
llm_datasets_root, llm_rouge_root,
cmodel_dir, engine_dir,
per_token_channel):
"gpt2-smooth test on single gpu"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(
llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-smooth",
model_path=llm_gpt2_model_root,
data_type=dtype,
per_token=per_token_channel,
per_channel=per_token_channel,
calib_dataset=f"{llm_datasets_root}/cimec/lambada")
print("Building engines...")
# NOTE: SQ does not support OOTB path for attention for now.
# Check tensorrt_llm/quantization/layers.py::SmoothQuantAttention for details.
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}", f"--tokenizer_dir={llm_gpt2_model_root}",
"--no_add_special_tokens"
])
@skip_post_blackwell
def test_llm_gpt2_int8_kv_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
llm_datasets_root, engine_dir, cmodel_dir):
"gpt2 INT8 KV Cache test on 1 gpu"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(
llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-int8-kv",
model_path=llm_gpt2_model_root,
data_type=dtype,
calib_dataset=f"{llm_datasets_root}/cimec/lambada")
print("Building engines...")
    # TODO: This case only supports running with the GPT attention plugin enabled.
# https://nvbugs/4175869
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}", f"--tokenizer_dir={llm_gpt2_model_root}",
"--no_add_special_tokens"
])
@skip_pre_ada
@pytest.mark.parametrize("quant_lm_head", [True, False])
@pytest.mark.parametrize("qformat", ["fp8", "fp8_pc_pt"])
def test_llm_gpt2_medium_fp8(gpt_example_root, llm_gpt2_medium_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, quant_lm_head, qformat):
if qformat == "fp8_pc_pt" and quant_lm_head:
pytest.skip("Skipping test for fp8_pc_pt with quant_lm_head")
"Build & Run gpt2-medium fp8 with 1 gpu"
print("Quantizing and converting checkpoint...")
dtype = "float16"
ckpt_dir = f"{cmodel_dir}/gpt2-medium/fp8/1-gpu"
quantize_cmd = [
f"{gpt_example_root}/../../../quantization/quantize.py",
f"--model_dir={llm_gpt2_medium_model_root}",
f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
f"--dtype={dtype}",
f"--qformat={qformat}",
f"--output_dir={ckpt_dir}",
]
if quant_lm_head:
quantize_cmd.append("--quantize_lm_head")
venv_check_call(llm_venv, quantize_cmd)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={1}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--max_num_tokens={924}",
f"--gemm_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run engines...')
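    # The accuracy threshold depends on the quantization recipe and whether the LM head is quantized.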
rouge1_threshold = 22.8 if qformat == "fp8_pc_pt" else (
20.9 if quant_lm_head else 21.7)
summary_cmd = [
f"{gpt_example_root}/../../../summarize.py",
f"--engine_dir={engine_dir}",
f"--hf_model_dir={llm_gpt2_medium_model_root}", "--test_trt_llm",
"--check_accuracy",
f"--tensorrt_llm_rouge1_threshold={rouge1_threshold}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root",
["starcoder", "starcoderplus", "starcoder2"],
indirect=True)
def test_starcoder_fp8_quantization_2gpu(gpt_example_root,
llm_gpt2_starcoder_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir):
"Build & Run gpt2-starcoder fp8 with 2 gpus"
print("Quantizing and converting checkpoint...")
dtype = "bfloat16"
ckpt_dir = f"{cmodel_dir}/gpt2-starcoder/fp8/2-gpu"
tp_size, pp_size = 2, 1
world_size = tp_size * pp_size
quantize_cmd = [
f"{gpt_example_root}/../../../quantization/quantize.py",
f"--model_dir={llm_gpt2_starcoder_model_root}",
f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
f"--dtype={dtype}",
"--qformat=fp8",
"--kv_cache_dtype=fp8",
f"--calib_tp_size={tp_size}",
f"--tp_size={tp_size}",
f"--output_dir={ckpt_dir}",
]
venv_check_call(llm_venv, quantize_cmd)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={1}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--max_num_tokens={924}",
f"--gemm_plugin={dtype}",
f"--workers={world_size}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run engines...')
summary_cmd = [
f"{gpt_example_root}/../../../summarize.py",
f"--engine_dir={engine_dir}",
f"--hf_model_dir={llm_gpt2_starcoder_model_root}", "--test_trt_llm",
"--check_accuracy", "--tensorrt_llm_rouge1_threshold=17.5",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
def test_llm_gpt2_next_1gpu(gpt_example_root, llm_venv,
llm_gpt2_next_model_root, engine_dir, cmodel_dir):
"RoPE is only supported with GPTAttention plugin"
print("Converting checkpoint...")
dtype = "bfloat16"
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-next",
model_path=llm_gpt2_next_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
f"--engine_dir={engine_dir}",
f"--vocab_file={ckpt_dir}/tokenizer.model", "--no_add_special_tokens"
])
# transformers compatibility issues
@pytest.mark.parametrize("tensor_parallel", [1, 2], ids=["tp1", "tp2"])
@pytest.mark.parametrize("use_py_session", [False, True],
ids=["use_cpp_session", "use_py_session"])
def test_llm_gpt2_next_prompt_tuning(gpt_example_root, llm_venv,
llm_gpt2_next_model_root, cmodel_dir,
engine_dir, tensor_parallel,
use_py_session):
f"gpt-next prompt tuning on {tensor_parallel} gpu(s)"
dtype = "bfloat16"
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-next",
model_path=llm_gpt2_next_model_root,
gpus=tensor_parallel,
tp_size=tensor_parallel,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size=4",
f"--max_input_len=924",
f"--max_seq_len=1024",
f"--gpt_attention_plugin={dtype}",
"--max_prompt_embedding_table_size=200",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Converting prompt-tuning table...")
squad_table_nemo = Path(llm_gpt2_next_model_root
).parent / "p-tuning" / "gpt2b_gpt2-squad-vt60.nemo"
squad_table = Path(gpt_example_root) / "prompt_table_squad.npy"
train900_table_nemo = Path(
llm_gpt2_next_model_root
).parent / "p-tuning" / "gpt2b_gpt2b-train900-v2.nemo"
train900_table = Path(gpt_example_root) / "prompt_table_train900.npy"
for (in_file, out_file) in [(squad_table_nemo, squad_table),
(train900_table_nemo, train900_table)]:
table_conv_cmd = [
f"{gpt_example_root}/nemo_prompt_convert.py", "-i",
str(in_file), "-o",
str(out_file)
]
venv_check_call(llm_venv, table_conv_cmd)
    # Write the merged table to its own file so the individual tables stay intact.
    merged_table = Path(gpt_example_root) / "prompt_table_merged.npy"
table_merge_cmd = [
f"{gpt_example_root}/merge_ptuning_tables.py",
str(squad_table),
str(train900_table),
str(merged_table)
]
venv_check_call(llm_venv, table_merge_cmd)
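    # The merged table holds both adapters: task 0 is the squad table and task 1 is the
    # train900 table, matching --prompt_tasks=0,1 in the batched runs below.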
inference_params = {
"squad": {
"num_v_tokens":
50,
"input":
"Context: In Hinduism the spiritual teacher is known as a guru, and, in many traditions of Hinduism - especially those common in the West - the emphasis on spiritual mentorship is extremely high, with gurus often exercising a great deal of control over the lives of their disciples.\n\nQuestion: Who do gurus control?\n\nAnswer:",
"outputs": [
"The answer is, of course, the disciple.",
"The guru controls the disciple's life, but",
"The guru is the one who controls the disciple."
],
},
"train900": {
"num_v_tokens": 20,
"input":
"Context: Carlsen faced Anand in the World Chess Championship 2013, at Hyatt Regency in Chennai, India, from 9 to 22 November. Carlsen won the match 6.53.5 by winning games five, six and nine and drawing the remainder, becoming the new World Chess Champion.\n\nQuestion: When did Carlsen become World Chess Champion?\n\nAnswer:",
"outputs":
["2013", "2013" + os.linesep + os.linesep + "Question: Who"],
}
}
print("Running inference...")
def parse_output(text: str) -> list[str]:
results = []
while True:
match = re.search(
r"Output \[Text \d+ Beam \d+\]: \"([^\"]*)\"" + os.linesep,
text, re.MULTILINE)
if match is None:
break
_, end = match.span()
results.append(match.group(1))
text = text[end:]
return results
# test model without p-tuning dict
run_cmd = [
f"{gpt_example_root}/../../../run.py",
"--no_add_special_tokens",
"--max_output_len=10",
f"--engine_dir={engine_dir}",
f"--vocab_file={ckpt_dir}/tokenizer.model",
f"--input_text={inference_params['squad']['input']}",
]
if use_py_session:
run_cmd.append("--use_py_session")
output = venv_mpi_check_output(
llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
run_cmd)
assert any(
similar(parse_output(output)[0][:len(ref) + 1], ref)
for ref in inference_params["squad"]["outputs"]), "incorrect output"
# test p-tuning task separately
run_cmd = [
f"{gpt_example_root}/../../../run.py",
"--no_add_special_tokens",
"--max_output_len=10",
f"--engine_dir={engine_dir}",
f"--vocab_file={ckpt_dir}/tokenizer.model",
f"--prompt_table={squad_table}",
f"--num_prepend_vtokens={inference_params['squad']['num_v_tokens']}",
f"--input_text={inference_params['squad']['input']}",
f"--no-kv_cache_enable_block_reuse",
]
if use_py_session:
run_cmd.append("--use_py_session")
output = venv_mpi_check_output(
llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
run_cmd)
assert any(
similar(parse_output(output)[0][:len(ref) + 1], ref)
for ref in inference_params["squad"]["outputs"]), "incorrect output"
run_cmd = [
f"{gpt_example_root}/../../../run.py",
"--no_add_special_tokens",
"--max_output_len=10",
f"--engine_dir={engine_dir}",
f"--vocab_file={ckpt_dir}/tokenizer.model",
f"--prompt_table={train900_table}",
f"--num_prepend_vtokens={inference_params['train900']['num_v_tokens']}",
f"--input_text={inference_params['train900']['input']}",
f"--no-kv_cache_enable_block_reuse",
]
if use_py_session:
run_cmd.append("--use_py_session")
output = venv_mpi_check_output(
llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
run_cmd)
assert any(
similar(parse_output(output)[0][:len(ref) + 1], ref)
for ref in inference_params["train900"]["outputs"]), "incorrect output"
# test batched p-tuning tasks
run_cmd = [
f"{gpt_example_root}/../../../run.py",
"--no_add_special_tokens",
"--max_output_len=10",
f"--engine_dir={engine_dir}",
f"--vocab_file={ckpt_dir}/tokenizer.model",
f"--prompt_table={merged_table}",
f"--num_prepend_vtokens",
str(inference_params['squad']['num_v_tokens']),
str(inference_params['train900']['num_v_tokens']),
f"--prompt_tasks=0,1",
f"--input_text",
inference_params["squad"]["input"],
inference_params['train900']['input'],
f"--no-kv_cache_enable_block_reuse",
]
if use_py_session:
run_cmd.append("--use_py_session")
output = venv_mpi_check_output(
llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
run_cmd)
outputs = parse_output(output)
assert any(
similar(outputs[0][:len(ref) + 1], ref)
for ref in inference_params["squad"]["outputs"]), "incorrect output"
assert any(
similar(outputs[1][:len(ref) + 1], ref)
for ref in inference_params["train900"]["outputs"]), "incorrect output"
# test batched and streamed p-tuning tasks
    # Streaming with the py session does not return the complete sequence, so it is skipped here.
if not use_py_session and tensor_parallel == 1:
run_cmd = [
f"{gpt_example_root}/../../../run.py",
"--no_add_special_tokens",
"--max_output_len=10",
f"--engine_dir={engine_dir}",
f"--vocab_file={ckpt_dir}/tokenizer.model",
f"--prompt_table={merged_table}",
f"--num_prepend_vtokens",
str(inference_params['squad']['num_v_tokens']),
str(inference_params['train900']['num_v_tokens']),
f"--prompt_tasks=0,1",
"--streaming",
f"--input_text",
inference_params["squad"]["input"],
inference_params['train900']['input'],
f"--no-kv_cache_enable_block_reuse",
]
output = venv_mpi_check_output(
llm_venv,
["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
run_cmd)
outputs = parse_output(output)
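        # With two batched inputs, the streamed outputs interleave per generation step; the test
        # takes even indices as the squad prompt and odd indices as the train900 prompt.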
squad_outputs = outputs[::2]
train900_outputs = outputs[1::2]
for outputs, valid_outputs in [
(squad_outputs, inference_params["squad"]["outputs"]),
(train900_outputs, inference_params["train900"]["outputs"])
]:
assert any(
similar(outputs[-1][:len(ref) + 1], ref)
for ref in valid_outputs), "incorrect output"
similarities = []
for suboutput in outputs:
similarities.append(
max([
similarity_score(suboutput, expect)
for expect in valid_outputs
]))
assert (
all(x <= y for x, y in zip(similarities, similarities[1:]))
), f"streaming outputs must have a monotonically increasing similarity score. valid_outputs: {valid_outputs}, outputs: {outputs}, similarities: {similarities}"
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
"tp_pp_size", [(4, 1), (2, 2), (1, 4)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
def test_llm_gpt2_medium_1node_4gpus(gpt_example_root,
llm_gpt2_medium_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
tp_pp_size):
print("Converting checkpoint...")
dtype = 'float16'
tp_size, pp_size = tp_pp_size
world_size = tp_size * pp_size
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-medium",
model_path=llm_gpt2_medium_model_root,
data_type=dtype,
gpus=world_size,
tp_size=tp_size,
pp_size=pp_size,
workers=world_size)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--max_batch_size=8",
"--max_input_len=924",
"--max_seq_len=1024",
f"--gemm_plugin={dtype}",
f"--workers={world_size}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run engines...")
summary_cmd = [
f"{gpt_example_root}/../../../summarize.py", "--test_trt_llm",
f"--engine_dir={engine_dir}",
f"--hf_model_dir={llm_gpt2_medium_model_root}", "--check_accuracy",
"--tensorrt_llm_rouge1_threshold=19",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
"use_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("context_fmha", [True, False],
ids=["enable_fmha", "disable_fmha"])
@pytest.mark.parametrize("parallel_build", [True, False],
ids=["parallel_build", "serial_build"])
def test_llm_gpt2_santacoder_1node_4gpus(gpt_example_root,
llm_gpt2_santacoder_model_root,
llm_venv, engine_dir, cmodel_dir,
use_attention_plugin, use_gemm_plugin,
context_fmha, parallel_build):
"Build & Run GPT2 variant santacoder"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-santacoder",
model_path=llm_gpt2_santacoder_model_root,
data_type=dtype,
gpus=4,
tp_size=4)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
]
if use_attention_plugin:
build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
if context_fmha:
build_cmd.extend(["--context_fmha=enable"])
else:
build_cmd.extend(["--context_fmha=disable"])
else:
build_cmd.extend([
"--gpt_attention_plugin=disable",
"--context_fmha=disable",
"--paged_kv_cache=disable",
"--remove_input_padding=disable",
])
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
if parallel_build:
build_cmd.extend(["--workers=4"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run gpt2-santacoder...')
venv_mpi_check_call(
llm_venv,
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "4"], [
f"{gpt_example_root}/../../../run.py", "--max_output_len=20",
f"--engine_dir={engine_dir}", "--tokenizer_dir",
llm_gpt2_santacoder_model_root, "--input_text",
"def print_hello_world():", "--no_add_special_tokens"
])
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
"use_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("context_fmha", [True, False],
ids=["enable_fmha", "disable_fmha"])
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root",
["starcoder", "starcoderplus", "starcoder2"],
indirect=True)
def test_llm_gpt2_starcoder_1node_4gpus(gpt_example_root,
llm_gpt2_starcoder_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
use_attention_plugin, use_gemm_plugin,
context_fmha):
"Build & Run GPT2 variant starcoder"
print("Converting checkpoint...")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-starcoder",
model_path=llm_gpt2_starcoder_model_root,
data_type=dtype,
gpus=4,
tp_size=4)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
"--workers=4",
]
if use_attention_plugin:
build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
if context_fmha:
build_cmd.extend(["--context_fmha=enable"])
else:
build_cmd.extend(["--context_fmha=disable"])
else:
build_cmd.extend([
"--gpt_attention_plugin=disable",
"--context_fmha=disable",
"--paged_kv_cache=disable",
"--remove_input_padding=disable",
])
if use_gemm_plugin:
build_cmd.extend([f"--gemm_plugin={dtype}"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run gpt2-starcoder...')
venv_mpi_check_call(
llm_venv,
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "4"], [
f"{gpt_example_root}/../../../run.py",
"--max_output_len=20",
f"--engine_dir={engine_dir}",
"--tokenizer_dir",
llm_gpt2_starcoder_model_root,
"--input_text",
"def print_hello_world():",
"--no_add_special_tokens",
])
summary_cmd = generate_summary_cmd(
gpt_example_root,
"no_add_special_tokens",
batch_size=1,
engine_dir=engine_dir,
eval_task="code_completion",
hf_model_dir=llm_gpt2_starcoder_model_root,
max_attention_window_size=4096,
tensorrt_llm_rouge1_threshold=25,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
print('Run gpt2-starcoder summarize...')
venv_mpi_check_call(
llm_venv,
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "4"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_host_memory(250000)
def test_llm_gpt2_starcoder_1gpus(gpt_example_root,
llm_gpt2_starcoder_model_root, llm_venv,
engine_dir, cmodel_dir):
"Build & Run GPT2 variant starcoder on single gpu"
print("Converting checkpoint...")
print(f"cmodel dir is {cmodel_dir}")
dtype = 'float16'
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-starcoder",
model_path=llm_gpt2_starcoder_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--context_fmha=enable",
f"--gemm_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run gpt2-starcoder...')
    run_cmd = [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=20",
        f"--engine_dir={engine_dir}", "--tokenizer_dir",
        llm_gpt2_starcoder_model_root, "--input_text",
        "def print_hello_world():", "--no_add_special_tokens"
    ]
    venv_check_call(llm_venv, run_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_host_memory(250000)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("precision", ["int8", "int4"])
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root",
["starcoder", "starcoderplus", "starcoder2"],
indirect=True)
def test_llm_gpt2_starcoder_weight_only(gpt_example_root,
llm_gpt2_starcoder_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir, dtype,
precision):
"Build & Run GPT2 variant starcoder with int8/int4 weight only"
print("Converting checkpoint...")
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-starcoder",
model_path=llm_gpt2_starcoder_model_root,
data_type=dtype,
use_weight_only=True,
weight_only_precision=precision)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--context_fmha=enable",
f"--gemm_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run gpt2-starcoder...')
    run_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--max_output_len=20",
        f"--engine_dir={engine_dir}",
        "--tokenizer_dir",
        llm_gpt2_starcoder_model_root,
        "--input_text",
        "def print_hello_world():",
        "--no_add_special_tokens",
    ]
    venv_check_call(llm_venv, run_cmd)
summary_cmd = generate_summary_cmd(
gpt_example_root,
"no_add_special_tokens",
batch_size=1,
engine_dir=engine_dir,
eval_task="code_completion",
hf_model_dir=llm_gpt2_starcoder_model_root,
max_attention_window_size=4096,
tensorrt_llm_rouge1_threshold=25,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
print('Run gpt2-starcoder summarize...')
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("tensor_parallel", [1, 2], ids=["tp1", "tp2"])
@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
def test_llm_gpt2_starcoder2(gpt_example_root, llm_gpt2_starcoder2_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, dtype, tensor_parallel):
"Build & Run GPT2 variant starcoder2 on single gpu"
print("Converting checkpoint...")
print(f"cmodel dir is {cmodel_dir}")
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-starcoder2",
model_path=llm_gpt2_starcoder2_model_root,
data_type=dtype,
gpus=tensor_parallel,
tp_size=tensor_parallel)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={1}",
f"--max_input_len={1024}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--context_fmha=enable",
f"--gemm_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
    print('Run gpt2-starcoder2...')
venv_mpi_check_call(
llm_venv,
parse_mpi_cmd([
"mpirun", "--allow-run-as-root", "--oversubscribe", "-np",
str(tensor_parallel)
]), [
f"{gpt_example_root}/../../../summarize.py", "--batch_size=1",
f"--engine_dir={engine_dir}", "--test_trt_llm", "--check_accuracy",
"--eval_task=code_completion",
f"--hf_model_dir={llm_gpt2_starcoder2_model_root}",
"--no_add_special_tokens", "--max_attention_window_size=4096",
"--tensorrt_llm_rouge1_threshold=25",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
])
@pytest.mark.parametrize("qformat", ["fp8", "full_prec"])
@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
@pytest.mark.parametrize("minitron_model_root", ["4b"], indirect=True)
def test_llm_minitron(gpt_example_root, minitron_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir, engine_dir, dtype,
qformat):
    "Build & Run Minitron on a single gpu"
    skip_fp8_pre_ada(qformat == 'fp8')
if qformat == 'fp8':
print("Quantizing and converting checkpoint...")
ckpt_dir = f"{cmodel_dir}/minitron/fp8/1-gpu"
quantize_cmd = [
f"{gpt_example_root}/../../../quantization/quantize.py",
f"--model_dir={minitron_model_root}",
f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
f"--dtype={dtype}",
"--qformat=fp8",
"--kv_cache_dtype=fp8",
f"--output_dir={ckpt_dir}",
]
venv_check_call(llm_venv, quantize_cmd)
else:
print(f"Converting checkpoint...")
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-minitron",
model_path=minitron_model_root,
data_type=dtype,
gpus=1,
tp_size=1)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={1}",
f"--max_input_len={1024}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--context_fmha=enable",
f"--gemm_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run Minitron...')
venv_mpi_check_call(
llm_venv,
parse_mpi_cmd(
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "1"]), [
f"{gpt_example_root}/../../../summarize.py", "--batch_size=1",
f"--engine_dir={engine_dir}", "--test_trt_llm",
"--check_accuracy", "--eval_task", "code_completion",
"--hf_model_dir", minitron_model_root,
"--no_add_special_tokens", "--max_attention_window_size=4096",
"--tensorrt_llm_rouge1_threshold=29",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
])
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("embedding_sharding_dim", [0, 1])
@pytest.mark.parametrize("dtype", ["float16"])
def test_llm_gpt2_parallel_embedding_2gpu(gpt_example_root, llm_venv,
llm_gpt2_model_root,
llm_datasets_root, llm_rouge_root,
cmodel_dir, engine_dir,
embedding_sharding_dim, dtype):
"GPT2 with parallel embedding"
print("Converting checkpoint...")
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2",
model_path=llm_gpt2_model_root,
data_type=dtype,
gpus=2,
tp_size=2,
use_parallel_embedding=True,
embedding_sharding_dim=embedding_sharding_dim)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={1000}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
"--workers=2",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
venv_mpi_check_call(llm_venv, [
"mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "2"
], [
f"{gpt_example_root}/../../../summarize.py", "--batch_size=8",
"--test_trt_llm", "--check_accuracy",
"--tensorrt_llm_rouge1_threshold=13.5", f"--engine_dir={engine_dir}",
f"--hf_model_dir={llm_gpt2_model_root}", "--no_add_special_tokens",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
])
@pytest.mark.parametrize("llm_gpt2b_lora_model_root",
[("gpt2b_lora-900.nemo", "gpt2b_lora-stories.nemo")],
ids=["900_stories"],
indirect=True)
def test_llm_gpt2_multi_lora_1gpu(gpt_example_root, llm_venv,
llm_gpt2_next_model_root, cmodel_dir,
engine_dir, llm_gpt2b_lora_model_root):
"gpt2 run lora with nemo checkpoint on 1 gpu"
print("Converting checkpoint...")
dtype = "float16"
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model="gpt2-next-lora",
model_path=llm_gpt2_next_model_root,
data_type=dtype)
print("Building engines...")
lora_900, lora_stories = llm_gpt2b_lora_model_root.split(",")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={4}",
f"--max_input_len={512}",
f"--max_seq_len={562}",
f"--max_beam_width={2}",
f"--gpt_attention_plugin={dtype}",
"--remove_input_padding=enable",
"--paged_kv_cache=enable",
"--context_fmha=enable",
f"--lora_plugin={dtype}",
"--lora_dir",
lora_900,
lora_stories,
"--lora_ckpt_source=nemo",
"--lora_target_modules",
"attn_qkv",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{gpt_example_root}/../../../run.py",
"--max_output_len=20",
"--use_py_session",
f"--vocab_file={ckpt_dir}/tokenizer.model",
f"--engine_dir={engine_dir}",
"--lora_task_uids",
"0",
"-1",
"1",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
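    # --lora_task_uids assigns an adapter per input: uids 0 and 1 select the two --lora_dir
    # checkpoints in order, while -1 is expected to run the base model without a LoRA adapter.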
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
expected_output = [
[
"He surprised the Canadians on May 28 in what became known as the Battle of Jumonville",
"Washington, with Tanaghrisson and his party, surprised the Canadians on May 28 in"
],
[
"The game is played with a deck of cards, and the player who has the most"
],
[
"You are a wizard who is a wizard. You are a wizard who is",
'The job title is "Spellcaster" and the job description is "Spell"'
],
]
for idx, result in enumerate(output):
assert any([similar(item, result)
for item in expected_output[idx]]), f"output is {output}"
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("data_type", ['float16', 'fp8'],
ids=['base_fp16', 'base_fp8'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root", ['starcoder2'],
indirect=True)
@pytest.mark.parametrize("llm_lora_model_root",
['peft-lora-starcoder2-15b-unity-copilot'],
indirect=True)
def test_llm_gpt_starcoder_lora_1gpu(data_type, lora_data_type,
gpt_example_root,
llm_gpt2_starcoder_model_root,
llm_datasets_root, llm_venv, cmodel_dir,
engine_dir, llm_lora_model_root,
qcache_dir):
"run starcoder2 lora test on 1gpu"
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
else:
if get_device_memory() < 80000:
pytest.skip("GPU memory is not sufficient.")
print("Converting checkpoint...")
model_name = 'starcoder2-lora'
if data_type == 'fp8':
model_dir = quantize_data(
llm_venv,
gpt_example_root,
model_dir=llm_gpt2_starcoder_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
kv_cache_dtype="fp8",
quantize_dir=qcache_dir,
calib_size=512)
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=gpt_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llm_gpt2_starcoder_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--lora_plugin=auto",
"--gemm_plugin=auto",
f"--lora_dir={llm_lora_model_root}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
ref_1 = [
610, 1489, 100, 7670, 100, 5879, 2284, 303, 1489, 459, 8302, 10914,
16013, 222, 222, 610, 1489, 100, 7670, 100, 5879, 100, 115, 100, 5598,
45, 115
]
ref_2 = [
610, 1489, 100, 7670, 100, 5879, 2284, 303, 1489, 459, 8302, 10914, 678,
222, 222, 610, 1489, 100, 7670, 100, 5879, 100, 115, 100, 5598, 45, 115
]
input_text = "def print_hello_world():"
print(f"Run inference with lora id 0...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../run.py",
"--max_output_len=20",
f"--input_text={input_text}",
"--lora_task_uids=0",
f"--tokenizer_dir={llm_gpt2_starcoder_model_root}",
f"--engine_dir={engine_dir}",
f"--output_csv={llm_venv.get_working_directory()}/use_lora.csv",
"--no_add_special_tokens",
"--use_py_session",
])
with open(f"{llm_venv.get_working_directory()}/use_lora.csv") as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
assert ref_1 == predict or data_type != "float16"
print(f"Run inference with lora id -1...")
venv_check_call(llm_venv, [
f"{gpt_example_root}/../../../run.py",
"--max_output_len=20",
f"--input_text={input_text}",
"--lora_task_uids=-1",
f"--tokenizer_dir={llm_gpt2_starcoder_model_root}",
f"--engine_dir={engine_dir}",
f"--output_csv={llm_venv.get_working_directory()}/no_lora.csv",
"--no_add_special_tokens",
"--use_py_session",
])
with open(f"{llm_venv.get_working_directory()}/no_lora.csv") as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
assert ref_2 == predict or data_type != "float16"
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root", ['starcoder2'],
indirect=True)
def test_llm_starcoder2_sqootb_single_gpu(gpt_example_root, llm_venv,
llm_gpt2_starcoder_model_root,
llm_datasets_root, llm_rouge_root,
cmodel_dir, engine_dir):
"Starcoder2-smooth test on single gpu"
print("Quantization...")
dtype = 'float16'
ckpt_dir = f"{cmodel_dir}/starcoder2/int8_sq/1-gpu"
quantize_cmd = [
f"{gpt_example_root}/../../../quantization/quantize.py",
f"--model_dir={llm_gpt2_starcoder_model_root}",
f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
f"--dtype={dtype}",
"--qformat=int8_sq",
f"--output_dir={ckpt_dir}",
]
venv_check_call(llm_venv, quantize_cmd)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_seq_len={4096}",
f"--gpt_attention_plugin={dtype}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print('Run starcoder2...')
venv_mpi_check_call(
llm_venv,
parse_mpi_cmd(
["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "1"]), [
f"{gpt_example_root}/../../../summarize.py", "--batch_size=1",
f"--engine_dir={engine_dir}", "--test_trt_llm",
"--check_accuracy", "--eval_task", "code_completion",
f"--hf_model_dir={llm_gpt2_starcoder_model_root}",
"--no_add_special_tokens", "--max_attention_window_size=4096",
"--tensorrt_llm_rouge1_threshold=25",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
])
@skip_pre_ada
@pytest.mark.parametrize("minitron_model_root", ["4b"], indirect=True)
def test_llm_minitron_fp8_with_pseudo_loras(gpt_example_root,
minitron_model_root,
llm_datasets_root,
llm_venv,
cmodel_dir,
engine_dir,
dtype='bfloat16'):
"Run Minitron model with multiple pseudo LoRAs."
# Quantize the base model to fp8.
print("Quantizing and converting checkpoint...")
ckpt_dir = f"{cmodel_dir}/minitron/fp8/1-gpu"
quantize_cmd = [
f"{gpt_example_root}/../../../quantization/quantize.py",
f"--model_dir={minitron_model_root}",
f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
f"--dtype={dtype}",
"--qformat=fp8",
"--kv_cache_dtype=fp8",
f"--output_dir={ckpt_dir}",
]
venv_check_call(llm_venv, quantize_cmd)
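    # Build and run the engine with two dummy rank-8 LoRA adapters targeting the attention
    # q/k/v projections, layered on top of the fp8-quantized base checkpoint.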
test_multi_lora_support(
hf_model_dir=minitron_model_root,
tllm_ckpt_dir=ckpt_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=gpt_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
)