# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module test_gpt: tests for the GPT examples."""
import csv
import os
import re
from pathlib import Path

import defs.ci_profiler
import pytest
from defs.common import (convert_weights, generate_summary_cmd, parse_mpi_cmd,
                         parse_output, quantize_data, run_and_check, similar,
                         similarity_score, test_multi_lora_support,
                         venv_check_call, venv_check_output,
                         venv_mpi_check_call, venv_mpi_check_output)
from defs.conftest import (get_device_memory, get_sm_version, skip_fp8_pre_ada,
                           skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call

from tensorrt_llm import LLM
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.sampling_params import SamplingParams

# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
    pytest.skip(
        "TRT workflow tests are not supported on post Blackwell-Ultra architecture",
        allow_module_level=True)
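
# (Hedged note, not something enforced here: get_sm_version() is assumed to
# return the compute capability as an integer, e.g. 90 for Hopper, so values
# >= 103 are treated as post-Blackwell-Ultra parts per the comment above.)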

INPUT_TEXT_1 = "After Washington had returned to Williamsburg, " + \
    "Dinwiddie ordered him to lead a larger force to assist Trent in his work. " + \
    "While en route, Washington learned of Trent's retreat. " + \
    "Since Tanaghrisson had promised support to the British, " + \
    "Washington continued toward Fort Duquesne and met with the Mingo leader. " + \
    "Learning of a French scouting party in the area, Washington, " + \
    "with Tanaghrisson and his party, surprised the Canadians on May 28 " + \
    "in what became known as the Battle of Jumonville Glen. " + \
    "They killed many of the Canadians, including their commanding officer, " + \
    "Joseph Coulon de Jumonville, whose head was reportedly split open by " + \
    "Tanaghrisson with a tomahawk. The historian Fred Anderson suggests that " + \
    "Tanaghrisson was acting to gain the support of the British and regain " + \
    "authority over his own people. They had been inclined to support the French, " + \
    "with whom they had long trading relationships. One of Tanaghrisson's men told " + \
    "Contrecoeur that Jumonville had been killed by British musket fire. " + \
    "Question: Upon learning of a French scounting party in the area, " + \
    "what did Washington do? Answer:"

INPUT_TEXT_2 = "You hold the job title in the Wizarding World of Harry Potter where you " + \
    "say random words looking for spells"


@pytest.mark.parametrize("num_beams", [1, 4],
                         ids=["num_beams_1", "num_beams_4"])
@pytest.mark.parametrize(
    "return_all_generated_tokens", [True, False],
    ids=["return_all_generated_tokens", "disable_return_all_generated_tokens"])
@pytest.mark.parametrize("batch_size", [1, 3],
                         ids=["batch_size_1", "batch_size_3"])
def test_streaming_beam(gpt_example_root, llm_venv, llm_gpt2_model_root,
                        engine_dir, cmodel_dir, num_beams,
                        return_all_generated_tokens, batch_size):
    """ Test the correctness of beam search + streaming versus the outputs of
    non-streaming beam search. Both use the cpp runtime.
    The num_beams=1 test acts as a test for `return_all_generated_tokens`"""

    dtype = 'float16'
    output_len = 10
    texts = ["want to", "Movies are just", "Soyer was"]
    input_text = texts[:batch_size]

    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2",
                               model_path=llm_gpt2_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        f"--gemm_plugin={dtype}",
        f"--max_beam_width={num_beams}",
        "--context_fmha=enable",
        "--use_paged_context_fmha=enable",
        "--paged_kv_cache=enable",
        "--remove_input_padding=enable",
    ]
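    # Sketch of the arithmetic implied by the build flags above (not an extra
    # constraint enforced by this test): with --max_input_len=924 and
    # --max_seq_len=1024, a full-length prompt leaves room for at most
    # 1024 - 924 = 100 generated tokens per sequence.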
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")

    streaming_command = [
        f"{gpt_example_root}/../../../run.py", "--no_add_special_tokens",
        f"--max_output_len={output_len}", f"--engine_dir={engine_dir}",
        f"--tokenizer_dir={llm_gpt2_model_root}", f"--streaming",
        f"--streaming_interval=1", f"--num_beams={num_beams}", f"--input_text",
        *input_text
    ]
    if return_all_generated_tokens:
        streaming_command += ["--return_all_generated_tokens"]
    streaming_outputs = venv_check_output(llm_venv, streaming_command)

    joined_nonstreamed_outputs = ""
    for length_iterator in range(1, output_len + 1):
        command = [
            f"{gpt_example_root}/../../../run.py", "--no_add_special_tokens",
            f"--max_output_len={length_iterator}", f"--engine_dir={engine_dir}",
            f"--tokenizer_dir={llm_gpt2_model_root}",
            f"--num_beams={num_beams}", f"--input_text", *input_text
        ]
        if return_all_generated_tokens:
            command += ["--return_all_generated_tokens"]

        non_streaming_output = venv_check_output(llm_venv, command)
        joined_nonstreamed_outputs += "Output from command" + str(
            command) + "\n" + non_streaming_output

    def parse_output(text: str) -> list[str]:
        results = []
        while True:
            match = re.search(
                r"Output \[Text \d+ Beam \d+\]: \"([^\"]*)\"\r?\n", text)
            if match is None:
                break
            _, end = match.span()
            results.append(match.group(1))
            text = text[end:]
        return results
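
    # Minimal illustration (hypothetical text) of what this helper extracts:
    #   parse_output('Output [Text 0 Beam 0]: "hello world"\n')
    #   -> ["hello world"]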

    print("STREAMING OUTPUT HERE\n\n\n",
          streaming_outputs,
          "\n\n\n",
          sep="----")
    print("NON-STREAMING OUTPUT HERE\n\n\n",
          joined_nonstreamed_outputs,
          "\n\n\n",
          sep="----")
    parsed_streamed_outputs = parse_output(streaming_outputs)
    parsed_nonstreamed_outputs = parse_output(joined_nonstreamed_outputs)

    def ordered_subset(s1, s2):
        """
        Use this to check if the streamed outputs are an ordered subset of nonstreamed
        Streaming can sometimes skip outputs
        """
        s2 = iter(s2)
        try:
            for c in s1:
                while next(s2) != c:
                    pass
            else:
                return True
        except StopIteration:
            return False
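
    # For instance (hypothetical lists), ordered_subset(["a", "c"], ["a", "b", "c"])
    # is True, while ordered_subset(["c", "a"], ["a", "b", "c"]) is False.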

    streaming_is_subset = ordered_subset(parsed_streamed_outputs,
                                         parsed_nonstreamed_outputs)
    print("streaming_is_subset ", streaming_is_subset)
    assert streaming_is_subset
    is_equal = (parsed_streamed_outputs == parsed_nonstreamed_outputs)
    print("is_equal", is_equal)
    if not is_equal:
        print("Differences:")
        for streamed, nonstreamed in zip(parsed_streamed_outputs,
                                         parsed_nonstreamed_outputs):
            if (streamed != nonstreamed):
                print("Streamed:", streamed)
                print("Nonstreamed:", nonstreamed)

    # streaming can skip outputs if the next set of outputs arrives.
    # this means that the is_equal flag is currently flaky: https://nvbugspro.nvidia.com/bug/4851644
    # assert is_equal


def test_llm_gpt2_kv_cache_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
                                engine_dir, cmodel_dir):
    "gpt2 cases on 1 gpu"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2",
                               model_path=llm_gpt2_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        f"--gemm_plugin={dtype}",
        "--context_fmha=enable",
        "--use_paged_context_fmha=enable",
        "--paged_kv_cache=enable",
        "--remove_input_padding=enable",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")

    venv_check_call(llm_venv, [
        f"{gpt_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}",
        "--test_hf",
        "--batch_size=1",
        "--test_trt_llm",
        f"--hf_model_dir={llm_gpt2_model_root}",
        "--check_accuracy",
        "--tensorrt_llm_rouge1_threshold=13.5",
        "--no_add_special_tokens",
        "--max_tokens_in_paged_kv_cache=1024",
    ])

    venv_check_call(llm_venv, [
        f"{gpt_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}",
        "--test_hf",
        "--batch_size=1",
        "--test_trt_llm",
        f"--hf_model_dir={llm_gpt2_model_root}",
        "--check_accuracy",
        "--tensorrt_llm_rouge1_threshold=13.5",
        "--no_add_special_tokens",
        "--kv_cache_enable_block_reuse",
        "--kv_cache_free_gpu_memory_fraction=0.5",
    ])


@pytest.mark.parametrize(
    "use_attention_plugin", [True, False],
    ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
def test_llm_gpt2_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
                       llm_datasets_root, llm_rouge_root, engine_dir,
                       cmodel_dir, use_attention_plugin, use_gemm_plugin):
    "gpt2 cases on 1 gpu"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2",
                               model_path=llm_gpt2_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
    ]

    if use_attention_plugin:
        build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
    else:
        build_cmd.extend([
            "--gpt_attention_plugin=disable",
            "--context_fmha=disable",
            "--paged_kv_cache=disable",
            "--remove_input_padding=disable",
        ])
    if use_gemm_plugin:
        build_cmd.extend([f"--gemm_plugin={dtype}"])

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")
    venv_check_call(llm_venv, [
        f"{gpt_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}", "--test_hf", "--batch_size=1",
        "--test_trt_llm", f"--hf_model_dir={llm_gpt2_model_root}",
        "--check_accuracy", "--tensorrt_llm_rouge1_threshold=13.5",
        "--no_add_special_tokens", f"--dataset_dir={llm_datasets_root}",
        f"--rouge_dir={llm_rouge_root}"
    ])

    if not use_gemm_plugin:
        print("Checking embedding sharing...")
        # Embedding sharing should be enabled automatically.
        # Gpt2 has 124M parameters among which 36.8M are shared between embedding and lm_head.
        # If embedding sharing is enabled, the FP16 engine size should be about 248 MB;
        # otherwise, the engine size should be about 321.6 MB.
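        # (Sanity check on those numbers, as a rough sketch: the 36.8M shared
        # FP16 parameters take ~36.8e6 * 2 bytes ~= 70-74 MB, which is roughly
        # the gap between ~321.6 MB and ~248 MB, so a 280 MB cut-off separates
        # the two cases.)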
        engine_size = os.path.getsize(f"{engine_dir}/rank0.engine") / (1024**2)
        assert engine_size < 280


@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("use_py_session", [False, True],
                         ids=["use_cpp_session", "use_py_session"])
@pytest.mark.parametrize("streaming", [False, True],
                         ids=["non_streaming", "streaming"])
def test_llm_gpt2_medium_1gpu(gpt_example_root, llm_venv,
                              llm_gpt2_medium_model_root, cmodel_dir,
                              engine_dir, use_gemm_plugin, use_py_session,
                              streaming):
    "gpt2-medium build & run"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-medium",
                               model_path=llm_gpt2_medium_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--paged_kv_cache=enable",
        "--remove_input_padding=enable",
    ]

    if use_gemm_plugin:
        build_cmd.extend([f"--gemm_plugin={dtype}"])

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    run_cmd = [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
        f"--engine_dir={engine_dir}",
        f"--tokenizer_dir={llm_gpt2_medium_model_root}",
        "--no_add_special_tokens"
    ]

    if streaming:
        run_cmd.append("--streaming")
    if use_py_session:
        run_cmd.append("--use_py_session")

    print("Running inference...")
    output = venv_check_output(llm_venv, run_cmd)

    valid_outputs = [
        "chef before moving to London in the early",
        "chef before moving to London in the late",
        "chef and eventually became a chef at a",
    ]

    if not streaming:
        output = parse_output(output)[0]
        assert any([similar(output, expect)
                    for expect in valid_outputs]), f"output is: {output}"
    else:
        # Fetch all outputs and expect a monotonically increasing similarity
        similarities = []
        for suboutput in parse_output(output):
            similarities.append(
                max([
                    similarity_score(suboutput, expect)
                    for expect in valid_outputs
                ]))
        assert (
            all(x <= y for x, y in zip(similarities, similarities[1:]))
        ), f"streaming outputs must have a monotonically increasing similarity score. similarities: {similarities}"
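        # E.g. (hypothetical scores) [0.2, 0.5, 0.9] passes this check while
        # [0.2, 0.5, 0.4] fails, since each streamed prefix should only get
        # closer to one of the expected completions.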
        output = parse_output(output)[-1]
        assert any([similar(output, expect)
                    for expect in valid_outputs]), f"output is: {output}"


@pytest.mark.parametrize("use_py_session", [False, True],
                         ids=["use_cpp_session", "use_py_session"])
@pytest.mark.parametrize("streaming", [False, True],
                         ids=["non_streaming", "streaming"])
def test_llm_gpt2_medium_bad_words_1gpu(gpt_example_root, llm_venv,
                                        llm_gpt2_medium_model_root, cmodel_dir,
                                        engine_dir, use_py_session, streaming):
    "gpt2 build & run"

    if use_py_session and streaming:
        pytest.skip(
            "Streaming with py session does not return complete sequence to reliably check stop words"
        )

    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-medium",
                               model_path=llm_gpt2_medium_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--paged_kv_cache=enable",
        "--remove_input_padding=enable",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    run_cmd = [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
        f"--engine_dir={engine_dir}",
        f"--tokenizer_dir={llm_gpt2_medium_model_root}",
        "--no_add_special_tokens"
    ]

    if streaming:
        run_cmd.append("--streaming")
    if use_py_session:
        run_cmd.append("--use_py_session")

    valid_outputs = [
        "chef before moving to the UK in the",
        "chef and eventually became a chef at a",
    ]
    bad_words_args = ["--bad_words", " London"]
    run_and_check(llm_venv,
                  run_cmd + bad_words_args,
                  valid_outputs,
                  streaming=streaming)

    bad_words_args = ["--bad_words", " to London", " irrelevant words"]
    run_and_check(llm_venv,
                  run_cmd + bad_words_args,
                  valid_outputs,
                  streaming=streaming)

    bad_words_args = ["--bad_words", " irrelevant words", " to London"]
    run_and_check(llm_venv,
                  run_cmd + bad_words_args,
                  valid_outputs,
                  streaming=streaming)


@pytest.mark.parametrize("use_py_session", [False, True],
                         ids=["use_cpp_session", "use_py_session"])
@pytest.mark.parametrize("streaming", [False, True],
                         ids=["non_streaming", "streaming"])
def test_llm_gpt2_medium_stop_words_1gpu(gpt_example_root, llm_venv,
                                         llm_gpt2_medium_model_root, cmodel_dir,
                                         engine_dir, use_py_session, streaming):
    "gpt2 build & run"
    if use_py_session and streaming:
        pytest.skip(
            "Streaming with py session does not return complete sequence to reliably check stop words"
        )

    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-medium",
                               model_path=llm_gpt2_medium_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--paged_kv_cache=enable",
        "--remove_input_padding=enable",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    run_cmd = [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
        f"--engine_dir={engine_dir}",
        f"--tokenizer_dir={llm_gpt2_medium_model_root}",
        "--no_add_special_tokens"
    ]

    if streaming:
        run_cmd.append("--streaming")
    if use_py_session:
        run_cmd.append("--use_py_session")

    valid_outputs = [
        "chef before moving to London",
        "chef and eventually became",
    ]
    stop_words_args = ["--stop_words", " London", " became"]
    run_and_check(llm_venv,
                  run_cmd + stop_words_args,
                  valid_outputs,
                  streaming=streaming)

    stop_words_args = [
        "--stop_words", " eventually became", " to London", " irrelevant output"
    ]
    run_and_check(llm_venv,
                  run_cmd + stop_words_args,
                  valid_outputs,
                  streaming=streaming)

    stop_words_args = [
        "--stop_words", " to London", " eventually became", " irrelevant output"
    ]
    run_and_check(llm_venv,
                  run_cmd + stop_words_args,
                  valid_outputs,
                  streaming=streaming)


@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize(
    "use_attention_plugin", [True, False],
    ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
def test_llm_gpt3_175b_2layers_1node_8gpus(gpt_example_root, llm_venv,
                                           engine_dir, use_attention_plugin,
                                           use_gemm_plugin):
    "Build & run GPT-3 175B: 2 layer w/ plugins, regression test for issue #20"
    dtype = 'float16'
    convert_cmd = [
        f"{gpt_example_root}/../../../generate_checkpoint_config.py",
        f"--output_path={engine_dir}/ckpt_config.json",
        "--architecture=GPTForCausalLM", f"--dtype={dtype}",
        "--num_hidden_layers=2", "--num_attention_heads=96",
        "--hidden_size=12288", "--vocab_size=51200", "--tp_size=8"
    ]
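    # (Rough parameter-count sketch, not asserted anywhere: a full GPT-3 175B
    # config has ~96 layers * 12 * 12288^2 ~= 174B weights; with only
    # --num_hidden_layers=2 this synthetic model is a few billion parameters,
    # which keeps the regression test cheap while exercising the same code path.)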
    venv_check_call(llm_venv, convert_cmd)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--model_config={engine_dir}/ckpt_config.json",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={256}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
    ]

    if use_attention_plugin:
        build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
    else:
        build_cmd.extend([
            "--gpt_attention_plugin=disable",
            "--context_fmha=disable",
            "--paged_kv_cache=disable",
            "--remove_input_padding=disable",
        ])
    if use_gemm_plugin:
        build_cmd.extend([f"--gemm_plugin={dtype}"])

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    venv_mpi_check_call(
        llm_venv,
        ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "8"], [
            f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
            f"--engine_dir={engine_dir}", "--no_add_special_tokens"
        ])


@pytest.mark.parametrize(
    "use_attention_plugin", [True, False],
    ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
def test_llm_gpt3_175b_96layers_build_only(gpt_example_root, llm_venv,
                                           engine_dir, use_attention_plugin,
                                           use_gemm_plugin):
    "Build GPT-3 175B: 96 layer w/ plugins"
    dtype = 'float16'
    convert_cmd = [
        f"{gpt_example_root}/../../../generate_checkpoint_config.py",
        f"--output_path={engine_dir}/ckpt_config.json",
        "--architecture=GPTForCausalLM", f"--dtype={dtype}",
        "--num_hidden_layers=96", "--num_attention_heads=96",
        "--hidden_size=12288", "--vocab_size=51200", "--tp_size=8"
    ]
    venv_check_call(llm_venv, convert_cmd)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--model_config={engine_dir}/ckpt_config.json",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={64}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
    ]

    if use_attention_plugin:
        build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
    else:
        build_cmd.extend([
            "--gpt_attention_plugin=disable",
            "--context_fmha=disable",
            "--paged_kv_cache=disable",
            "--remove_input_padding=disable",
        ])
    if use_gemm_plugin:
        build_cmd.extend([f"--gemm_plugin={dtype}"])

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)


@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
    "use_attention_plugin", [True, False],
    ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("context_fmha", [True, False],
                         ids=["enable_fmha", "disable_fmha"])
@pytest.mark.parametrize("parallel_build", [True, False],
                         ids=["parallel_build", "serial_build"])
def test_llm_gpt3_175b_1node_8gpus(gpt_example_root, llm_venv, engine_dir,
                                   use_attention_plugin, use_gemm_plugin,
                                   context_fmha, parallel_build,
                                   timeout_manager):
    "Build & Run GPT-3 175B: 96 layer w/ plugins"
    dtype = 'float16'

    # Convert checkpoint with timeout management
    with timeout_manager.timed_operation("convert"):
        convert_cmd = [
            f"{gpt_example_root}/../../../generate_checkpoint_config.py",
            f"--output_path={engine_dir}/ckpt_config.json",
            "--architecture=GPTForCausalLM", f"--dtype={dtype}",
            "--num_hidden_layers=96", "--num_attention_heads=96",
            "--hidden_size=12288", "--vocab_size=51200", "--tp_size=8"
        ]
        venv_check_call(llm_venv,
                        convert_cmd,
                        timeout=timeout_manager.remaining_timeout)

    # Build engines with timeout management
    print("Building engines...")
    with timeout_manager.timed_operation("build"):
        build_cmd = [
            "trtllm-build",
            f"--model_config={engine_dir}/ckpt_config.json",
            f"--output_dir={engine_dir}",
            f"--max_batch_size={32}",
            f"--max_input_len={924}",
            f"--max_seq_len={1024}",
        ]

        if use_attention_plugin:
            build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
            if context_fmha:
                build_cmd.extend(["--context_fmha=enable"])
            else:
                build_cmd.extend(["--context_fmha=disable"])
        else:
            build_cmd.extend([
                "--gpt_attention_plugin=disable",
                "--context_fmha=disable",
                "--paged_kv_cache=disable",
                "--remove_input_padding=disable",
            ])
        if use_gemm_plugin:
            build_cmd.extend([f"--gemm_plugin={dtype}"])
        if parallel_build:
            build_cmd.extend(["--workers=8"])

        check_call(" ".join(build_cmd),
                   shell=True,
                   env=llm_venv._new_env,
                   timeout=timeout_manager.remaining_timeout)

    # Run inference with timeout management
    print('Run gpt3-175b...')
    with timeout_manager.timed_operation("run"):
        venv_mpi_check_call(
            llm_venv,
            ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "8"], [
                f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
                f"--engine_dir={engine_dir}", "--no_add_special_tokens"
            ],
            timeout=timeout_manager.remaining_timeout)


@skip_post_blackwell
@pytest.mark.parametrize("per_token_channel", [True, False],
                         ids=["enable_ptpc", "disable_ptpc"])
def test_llm_gpt2_smooth_single_gpu_summary(gpt_example_root, llm_venv,
                                            llm_gpt2_model_root,
                                            llm_datasets_root, llm_rouge_root,
                                            cmodel_dir, engine_dir,
                                            per_token_channel):
    "gpt2-smooth test on single gpu"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(
        llm_venv=llm_venv,
        example_root=gpt_example_root,
        cmodel_dir=cmodel_dir,
        model="gpt2-smooth",
        model_path=llm_gpt2_model_root,
        data_type=dtype,
        per_token=per_token_channel,
        per_channel=per_token_channel,
        calib_dataset=f"{llm_datasets_root}/cimec/lambada")

    print("Building engines...")
    # NOTE: SQ does not support OOTB path for attention for now.
    # Check tensorrt_llm/quantization/layers.py::SmoothQuantAttention for details.
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")
    venv_check_call(llm_venv, [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
        f"--engine_dir={engine_dir}", f"--tokenizer_dir={llm_gpt2_model_root}",
        "--no_add_special_tokens"
    ])


@skip_post_blackwell
def test_llm_gpt2_int8_kv_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
                               llm_datasets_root, engine_dir, cmodel_dir):
    "gpt2 INT8 KV Cache test on 1 gpu"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(
        llm_venv=llm_venv,
        example_root=gpt_example_root,
        cmodel_dir=cmodel_dir,
        model="gpt2-int8-kv",
        model_path=llm_gpt2_model_root,
        data_type=dtype,
        calib_dataset=f"{llm_datasets_root}/cimec/lambada")

    print("Building engines...")
    # TODO: This case only supports enabling the gpt attention plugin.
    # https://nvbugs/4175869
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")
    venv_check_call(llm_venv, [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
        f"--engine_dir={engine_dir}", f"--tokenizer_dir={llm_gpt2_model_root}",
        "--no_add_special_tokens"
    ])


@skip_pre_ada
@pytest.mark.parametrize("quant_lm_head", [True, False])
@pytest.mark.parametrize("qformat", ["fp8", "fp8_pc_pt"])
def test_llm_gpt2_medium_fp8(gpt_example_root, llm_gpt2_medium_model_root,
                             llm_datasets_root, llm_rouge_root, llm_venv,
                             cmodel_dir, engine_dir, quant_lm_head, qformat):
    "Build & Run gpt2-medium fp8 with 1 gpu"
    if qformat == "fp8_pc_pt" and quant_lm_head:
        pytest.skip("Skipping test for fp8_pc_pt with quant_lm_head")
    print("Quantizing and converting checkpoint...")
    dtype = "float16"
    ckpt_dir = f"{cmodel_dir}/gpt2-medium/fp8/1-gpu"

    quantize_cmd = [
        f"{gpt_example_root}/../../../quantization/quantize.py",
        f"--model_dir={llm_gpt2_medium_model_root}",
        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
        f"--dtype={dtype}",
        f"--qformat={qformat}",
        f"--output_dir={ckpt_dir}",
    ]
    if quant_lm_head:
        quantize_cmd.append("--quantize_lm_head")
    venv_check_call(llm_venv, quantize_cmd)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={1}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--max_num_tokens={924}",
        f"--gemm_plugin={dtype}",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run engines...')
    rouge1_threshold = 22.8 if qformat == "fp8_pc_pt" else (
        20.9 if quant_lm_head else 21.7)
    summary_cmd = [
        f"{gpt_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}",
        f"--hf_model_dir={llm_gpt2_medium_model_root}", "--test_trt_llm",
        "--check_accuracy",
        f"--tensorrt_llm_rouge1_threshold={rouge1_threshold}",
        f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
    ]
    venv_check_call(llm_venv, summary_cmd)


@skip_pre_ada
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root",
                         ["starcoder", "starcoderplus", "starcoder2"],
                         indirect=True)
def test_starcoder_fp8_quantization_2gpu(gpt_example_root,
                                         llm_gpt2_starcoder_model_root,
                                         llm_datasets_root, llm_rouge_root,
                                         llm_venv, cmodel_dir, engine_dir):
    "Build & Run gpt2-starcoder fp8 with 2 gpus"
    print("Quantizing and converting checkpoint...")
    dtype = "bfloat16"
    ckpt_dir = f"{cmodel_dir}/gpt2-starcoder/fp8/2-gpu"

    tp_size, pp_size = 2, 1
    world_size = tp_size * pp_size
    quantize_cmd = [
        f"{gpt_example_root}/../../../quantization/quantize.py",
        f"--model_dir={llm_gpt2_starcoder_model_root}",
        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
        f"--dtype={dtype}",
        "--qformat=fp8",
        "--kv_cache_dtype=fp8",
        f"--calib_tp_size={tp_size}",
        f"--tp_size={tp_size}",
        f"--output_dir={ckpt_dir}",
    ]
    venv_check_call(llm_venv, quantize_cmd)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={1}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--max_num_tokens={924}",
        f"--gemm_plugin={dtype}",
        f"--workers={world_size}",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run engines...')
    summary_cmd = [
        f"{gpt_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}",
        f"--hf_model_dir={llm_gpt2_starcoder_model_root}", "--test_trt_llm",
        "--check_accuracy", "--tensorrt_llm_rouge1_threshold=17.5",
        f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
    ]
    venv_mpi_check_call(
        llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
        summary_cmd)


def test_llm_gpt2_next_1gpu(gpt_example_root, llm_venv,
                            llm_gpt2_next_model_root, engine_dir, cmodel_dir):
    "RoPE is only supported with GPTAttention plugin"
    print("Converting checkpoint...")
    dtype = "bfloat16"
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-next",
                               model_path=llm_gpt2_next_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")
    venv_check_call(llm_venv, [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=8",
        f"--engine_dir={engine_dir}",
        f"--vocab_file={ckpt_dir}/tokenizer.model", "--no_add_special_tokens"
    ])


# transformers compatibility issues
@pytest.mark.parametrize("tensor_parallel", [1, 2], ids=["tp1", "tp2"])
@pytest.mark.parametrize("use_py_session", [False, True],
                         ids=["use_cpp_session", "use_py_session"])
def test_llm_gpt2_next_prompt_tuning(gpt_example_root, llm_venv,
                                     llm_gpt2_next_model_root, cmodel_dir,
                                     engine_dir, tensor_parallel,
                                     use_py_session):
    f"gpt-next prompt tuning on {tensor_parallel} gpu(s)"
    dtype = "bfloat16"
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-next",
                               model_path=llm_gpt2_next_model_root,
                               gpus=tensor_parallel,
                               tp_size=tensor_parallel,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size=4",
        f"--max_input_len=924",
        f"--max_seq_len=1024",
        f"--gpt_attention_plugin={dtype}",
        "--max_prompt_embedding_table_size=200",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Converting prompt-tuning table...")
    squad_table_nemo = Path(llm_gpt2_next_model_root
                            ).parent / "p-tuning" / "gpt2b_gpt2-squad-vt60.nemo"
    squad_table = Path(gpt_example_root) / "prompt_table_squad.npy"
    train900_table_nemo = Path(
        llm_gpt2_next_model_root
    ).parent / "p-tuning" / "gpt2b_gpt2b-train900-v2.nemo"
    train900_table = Path(gpt_example_root) / "prompt_table_train900.npy"
    for (in_file, out_file) in [(squad_table_nemo, squad_table),
                                (train900_table_nemo, train900_table)]:
        table_conv_cmd = [
            f"{gpt_example_root}/nemo_prompt_convert.py", "-i",
            str(in_file), "-o",
            str(out_file)
        ]
        venv_check_call(llm_venv, table_conv_cmd)

    # (Assumed fix: write the merged table to its own file instead of reusing
    # the train900 path, so the single-task runs below still read their
    # original tables.)
    merged_table = Path(gpt_example_root) / "prompt_table_merged.npy"
    table_merge_cmd = [
        f"{gpt_example_root}/merge_ptuning_tables.py",
        str(squad_table),
        str(train900_table),
        str(merged_table)
    ]
    venv_check_call(llm_venv, table_merge_cmd)

    inference_params = {
        "squad": {
            "num_v_tokens":
            50,
            "input":
            "Context: In Hinduism the spiritual teacher is known as a guru, and, in many traditions of Hinduism - especially those common in the West - the emphasis on spiritual mentorship is extremely high, with gurus often exercising a great deal of control over the lives of their disciples.\n\nQuestion: Who do gurus control?\n\nAnswer:",
            "outputs": [
                "The answer is, of course, the disciple.",
                "The guru controls the disciple's life, but",
                "The guru is the one who controls the disciple."
            ],
        },
        "train900": {
            "num_v_tokens": 20,
            "input":
            "Context: Carlsen faced Anand in the World Chess Championship 2013, at Hyatt Regency in Chennai, India, from 9 to 22 November. Carlsen won the match 6.5–3.5 by winning games five, six and nine and drawing the remainder, becoming the new World Chess Champion.\n\nQuestion: When did Carlsen become World Chess Champion?\n\nAnswer:",
            "outputs":
            ["2013", "2013" + os.linesep + os.linesep + "Question: Who"],
        }
    }

    print("Running inference...")

    def parse_output(text: str) -> list[str]:
        results = []
        while True:
            match = re.search(
                r"Output \[Text \d+ Beam \d+\]: \"([^\"]*)\"" + os.linesep,
                text, re.MULTILINE)
            if match is None:
                break
            _, end = match.span()
            results.append(match.group(1))
            text = text[end:]
        return results

    # test model without p-tuning dict
    run_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--no_add_special_tokens",
        "--max_output_len=10",
        f"--engine_dir={engine_dir}",
        f"--vocab_file={ckpt_dir}/tokenizer.model",
        f"--input_text={inference_params['squad']['input']}",
    ]

    if use_py_session:
        run_cmd.append("--use_py_session")

    output = venv_mpi_check_output(
        llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
        run_cmd)
    assert any(
        similar(parse_output(output)[0][:len(ref) + 1], ref)
        for ref in inference_params["squad"]["outputs"]), "incorrect output"

    # test p-tuning task separately
    run_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--no_add_special_tokens",
        "--max_output_len=10",
        f"--engine_dir={engine_dir}",
        f"--vocab_file={ckpt_dir}/tokenizer.model",
        f"--prompt_table={squad_table}",
        f"--num_prepend_vtokens={inference_params['squad']['num_v_tokens']}",
        f"--input_text={inference_params['squad']['input']}",
        f"--no-kv_cache_enable_block_reuse",
    ]

    if use_py_session:
        run_cmd.append("--use_py_session")

    output = venv_mpi_check_output(
        llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
        run_cmd)
    assert any(
        similar(parse_output(output)[0][:len(ref) + 1], ref)
        for ref in inference_params["squad"]["outputs"]), "incorrect output"

    run_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--no_add_special_tokens",
        "--max_output_len=10",
        f"--engine_dir={engine_dir}",
        f"--vocab_file={ckpt_dir}/tokenizer.model",
        f"--prompt_table={train900_table}",
        f"--num_prepend_vtokens={inference_params['train900']['num_v_tokens']}",
        f"--input_text={inference_params['train900']['input']}",
        f"--no-kv_cache_enable_block_reuse",
    ]

    if use_py_session:
        run_cmd.append("--use_py_session")

    output = venv_mpi_check_output(
        llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
        run_cmd)
    assert any(
        similar(parse_output(output)[0][:len(ref) + 1], ref)
        for ref in inference_params["train900"]["outputs"]), "incorrect output"

    # test batched p-tuning tasks
    run_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--no_add_special_tokens",
        "--max_output_len=10",
        f"--engine_dir={engine_dir}",
        f"--vocab_file={ckpt_dir}/tokenizer.model",
        f"--prompt_table={merged_table}",
        f"--num_prepend_vtokens",
        str(inference_params['squad']['num_v_tokens']),
        str(inference_params['train900']['num_v_tokens']),
        f"--prompt_tasks=0,1",
        f"--input_text",
        inference_params["squad"]["input"],
        inference_params['train900']['input'],
        f"--no-kv_cache_enable_block_reuse",
    ]

    if use_py_session:
        run_cmd.append("--use_py_session")

    output = venv_mpi_check_output(
        llm_venv, ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
        run_cmd)

    outputs = parse_output(output)
    assert any(
        similar(outputs[0][:len(ref) + 1], ref)
        for ref in inference_params["squad"]["outputs"]), "incorrect output"
    assert any(
        similar(outputs[1][:len(ref) + 1], ref)
        for ref in inference_params["train900"]["outputs"]), "incorrect output"

    # test batched and streamed p-tuning tasks
    # Streaming with py session does not return complete sequences to reliably check outputs

    if not use_py_session and tensor_parallel == 1:
        run_cmd = [
            f"{gpt_example_root}/../../../run.py",
            "--no_add_special_tokens",
            "--max_output_len=10",
            f"--engine_dir={engine_dir}",
            f"--vocab_file={ckpt_dir}/tokenizer.model",
            f"--prompt_table={merged_table}",
            f"--num_prepend_vtokens",
            str(inference_params['squad']['num_v_tokens']),
            str(inference_params['train900']['num_v_tokens']),
            f"--prompt_tasks=0,1",
            "--streaming",
            f"--input_text",
            inference_params["squad"]["input"],
            inference_params['train900']['input'],
            f"--no-kv_cache_enable_block_reuse",
        ]

        output = venv_mpi_check_output(
            llm_venv,
            ["mpirun", "-n", f"{tensor_parallel}", "--allow-run-as-root"],
            run_cmd)

        outputs = parse_output(output)
        squad_outputs = outputs[::2]
        train900_outputs = outputs[1::2]
        for outputs, valid_outputs in [
            (squad_outputs, inference_params["squad"]["outputs"]),
            (train900_outputs, inference_params["train900"]["outputs"])
        ]:
            assert any(
                similar(outputs[-1][:len(ref) + 1], ref)
                for ref in valid_outputs), "incorrect output"
            similarities = []
            for suboutput in outputs:
                similarities.append(
                    max([
                        similarity_score(suboutput, expect)
                        for expect in valid_outputs
                    ]))
            assert (
                all(x <= y for x, y in zip(similarities, similarities[1:]))
            ), f"streaming outputs must have a monotonically increasing similarity score. valid_outputs: {valid_outputs}, outputs: {outputs}, similarities: {similarities}"


@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
    "tp_pp_size", [(4, 1), (2, 2), (1, 4)],
    ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
def test_llm_gpt2_medium_1node_4gpus(gpt_example_root,
                                     llm_gpt2_medium_model_root,
                                     llm_datasets_root, llm_rouge_root,
                                     llm_venv, cmodel_dir, engine_dir,
                                     tp_pp_size):
    print("Converting checkpoint...")
    dtype = 'float16'
    tp_size, pp_size = tp_pp_size
    world_size = tp_size * pp_size
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-medium",
                               model_path=llm_gpt2_medium_model_root,
                               data_type=dtype,
                               gpus=world_size,
                               tp_size=tp_size,
                               pp_size=pp_size,
                               workers=world_size)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        "--max_batch_size=8",
        "--max_input_len=924",
        "--max_seq_len=1024",
        f"--gemm_plugin={dtype}",
        f"--workers={world_size}",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Run engines...")
    summary_cmd = [
        f"{gpt_example_root}/../../../summarize.py", "--test_trt_llm",
        f"--engine_dir={engine_dir}",
        f"--hf_model_dir={llm_gpt2_medium_model_root}", "--check_accuracy",
        "--tensorrt_llm_rouge1_threshold=19",
        f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
    ]
    venv_mpi_check_call(
        llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
        summary_cmd)


@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
    "use_attention_plugin", [True, False],
    ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("context_fmha", [True, False],
                         ids=["enable_fmha", "disable_fmha"])
@pytest.mark.parametrize("parallel_build", [True, False],
                         ids=["parallel_build", "serial_build"])
def test_llm_gpt2_santacoder_1node_4gpus(gpt_example_root,
                                         llm_gpt2_santacoder_model_root,
                                         llm_venv, engine_dir, cmodel_dir,
                                         use_attention_plugin, use_gemm_plugin,
                                         context_fmha, parallel_build):
    "Build & Run GPT2 variant santacoder"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-santacoder",
                               model_path=llm_gpt2_santacoder_model_root,
                               data_type=dtype,
                               gpus=4,
                               tp_size=4)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
    ]

    if use_attention_plugin:
        build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
        if context_fmha:
            build_cmd.extend(["--context_fmha=enable"])
        else:
            build_cmd.extend(["--context_fmha=disable"])
    else:
        build_cmd.extend([
            "--gpt_attention_plugin=disable",
            "--context_fmha=disable",
            "--paged_kv_cache=disable",
            "--remove_input_padding=disable",
        ])
    if use_gemm_plugin:
        build_cmd.extend([f"--gemm_plugin={dtype}"])
    if parallel_build:
        build_cmd.extend(["--workers=4"])

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run gpt2-santacoder...')
    venv_mpi_check_call(
        llm_venv,
        ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "4"], [
            f"{gpt_example_root}/../../../run.py", "--max_output_len=20",
            f"--engine_dir={engine_dir}", "--tokenizer_dir",
            llm_gpt2_santacoder_model_root, "--input_text",
            "def print_hello_world():", "--no_add_special_tokens"
        ])


@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
    "use_attention_plugin", [True, False],
    ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("use_gemm_plugin", [True, False],
                         ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("context_fmha", [True, False],
                         ids=["enable_fmha", "disable_fmha"])
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root",
                         ["starcoder", "starcoderplus", "starcoder2"],
                         indirect=True)
def test_llm_gpt2_starcoder_1node_4gpus(gpt_example_root,
                                        llm_gpt2_starcoder_model_root,
                                        llm_datasets_root, llm_rouge_root,
                                        llm_venv, cmodel_dir, engine_dir,
                                        use_attention_plugin, use_gemm_plugin,
                                        context_fmha):
    "Build & Run GPT2 variant starcoder"
    print("Converting checkpoint...")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-starcoder",
                               model_path=llm_gpt2_starcoder_model_root,
                               data_type=dtype,
                               gpus=4,
                               tp_size=4)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        "--workers=4",
    ]

    if use_attention_plugin:
        build_cmd.extend([f"--gpt_attention_plugin={dtype}"])
        if context_fmha:
            build_cmd.extend(["--context_fmha=enable"])
        else:
            build_cmd.extend(["--context_fmha=disable"])
    else:
        build_cmd.extend([
            "--gpt_attention_plugin=disable",
            "--context_fmha=disable",
            "--paged_kv_cache=disable",
            "--remove_input_padding=disable",
        ])
    if use_gemm_plugin:
        build_cmd.extend([f"--gemm_plugin={dtype}"])

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run gpt2-starcoder...')
    venv_mpi_check_call(
        llm_venv,
        ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "4"], [
            f"{gpt_example_root}/../../../run.py",
            "--max_output_len=20",
            f"--engine_dir={engine_dir}",
            "--tokenizer_dir",
            llm_gpt2_starcoder_model_root,
            "--input_text",
            "def print_hello_world():",
            "--no_add_special_tokens",
        ])

    summary_cmd = generate_summary_cmd(
        gpt_example_root,
        "no_add_special_tokens",
        batch_size=1,
        engine_dir=engine_dir,
        eval_task="code_completion",
        hf_model_dir=llm_gpt2_starcoder_model_root,
        max_attention_window_size=4096,
        tensorrt_llm_rouge1_threshold=25,
        dataset_dir=llm_datasets_root,
        rouge_dir=llm_rouge_root)

    print('Run gpt2-starcoder summarize...')
    venv_mpi_check_call(
        llm_venv,
        ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "4"],
        summary_cmd)


@skip_post_blackwell
@pytest.mark.skip_less_host_memory(250000)
def test_llm_gpt2_starcoder_1gpus(gpt_example_root,
                                  llm_gpt2_starcoder_model_root, llm_venv,
                                  engine_dir, cmodel_dir):
    "Build & Run GPT2 variant starcoder on single gpu"
    print("Converting checkpoint...")
    print(f"cmodel dir is {cmodel_dir}")
    dtype = 'float16'
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-starcoder",
                               model_path=llm_gpt2_starcoder_model_root,
                               data_type=dtype)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--context_fmha=enable",
        f"--gemm_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run gpt2-starcoder...')
    summary_cmd = [
        f"{gpt_example_root}/../../../run.py", "--max_output_len=20",
        f"--engine_dir={engine_dir}", "--tokenizer_dir",
        llm_gpt2_starcoder_model_root, "--input_text",
        "def print_hello_world():", "--no_add_special_tokens"
    ]

    venv_check_call(llm_venv, summary_cmd)


@skip_post_blackwell
@pytest.mark.skip_less_host_memory(250000)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("precision", ["int8", "int4"])
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root",
                         ["starcoder", "starcoderplus", "starcoder2"],
                         indirect=True)
def test_llm_gpt2_starcoder_weight_only(gpt_example_root,
                                        llm_gpt2_starcoder_model_root,
                                        llm_datasets_root, llm_rouge_root,
                                        llm_venv, cmodel_dir, engine_dir, dtype,
                                        precision):
    "Build & Run GPT2 variant starcoder with int8/int4 weight only"

    print("Converting checkpoint...")
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-starcoder",
                               model_path=llm_gpt2_starcoder_model_root,
                               data_type=dtype,
                               use_weight_only=True,
                               weight_only_precision=precision)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_input_len={924}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--context_fmha=enable",
        f"--gemm_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run gpt2-starcoder...')
    summary_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--max_output_len=20",
        f"--engine_dir={engine_dir}",
        "--tokenizer_dir",
        llm_gpt2_starcoder_model_root,
        "--input_text",
        "def print_hello_world():",
        "--no_add_special_tokens",
    ]

    venv_check_call(llm_venv, summary_cmd)

    summary_cmd = generate_summary_cmd(
        gpt_example_root,
        "no_add_special_tokens",
        batch_size=1,
        engine_dir=engine_dir,
        eval_task="code_completion",
        hf_model_dir=llm_gpt2_starcoder_model_root,
        max_attention_window_size=4096,
        tensorrt_llm_rouge1_threshold=25,
        dataset_dir=llm_datasets_root,
        rouge_dir=llm_rouge_root)

    print('Run gpt2-starcoder summarize...')
    venv_check_call(llm_venv, summary_cmd)


@pytest.mark.parametrize("tensor_parallel", [1, 2], ids=["tp1", "tp2"])
@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
def test_llm_gpt2_starcoder2(gpt_example_root, llm_gpt2_starcoder2_model_root,
                             llm_datasets_root, llm_rouge_root, llm_venv,
                             cmodel_dir, engine_dir, dtype, tensor_parallel):
    "Build & Run GPT2 variant starcoder2 with 1-2 gpus"
    print("Converting checkpoint...")
    print(f"cmodel dir is {cmodel_dir}")
    ckpt_dir = convert_weights(llm_venv=llm_venv,
                               example_root=gpt_example_root,
                               cmodel_dir=cmodel_dir,
                               model="gpt2-starcoder2",
                               model_path=llm_gpt2_starcoder2_model_root,
                               data_type=dtype,
                               gpus=tensor_parallel,
                               tp_size=tensor_parallel)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={1}",
        f"--max_input_len={1024}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--context_fmha=enable",
        f"--gemm_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run gpt2-starcoder...')
    venv_mpi_check_call(
        llm_venv,
        parse_mpi_cmd([
            "mpirun", "--allow-run-as-root", "--oversubscribe", "-np",
            str(tensor_parallel)
        ]), [
            f"{gpt_example_root}/../../../summarize.py", "--batch_size=1",
            f"--engine_dir={engine_dir}", "--test_trt_llm", "--check_accuracy",
            "--eval_task=code_completion",
            f"--hf_model_dir={llm_gpt2_starcoder2_model_root}",
            "--no_add_special_tokens", "--max_attention_window_size=4096",
            "--tensorrt_llm_rouge1_threshold=25",
            f"--dataset_dir={llm_datasets_root}",
            f"--rouge_dir={llm_rouge_root}"
        ])


@pytest.mark.parametrize("qformat", ["fp8", "full_prec"])
@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
@pytest.mark.parametrize("minitron_model_root", ["4b"], indirect=True)
def test_llm_minitron(gpt_example_root, minitron_model_root, llm_datasets_root,
                      llm_rouge_root, llm_venv, cmodel_dir, engine_dir, dtype,
                      qformat):
    "Build & Run GPT2 variant minitron on single gpu"
    skip_fp8_pre_ada(qformat == 'fp8')

    if qformat == 'fp8':
        print("Quantizing and converting checkpoint...")
        ckpt_dir = f"{cmodel_dir}/minitron/fp8/1-gpu"

        quantize_cmd = [
            f"{gpt_example_root}/../../../quantization/quantize.py",
            f"--model_dir={minitron_model_root}",
            f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
            f"--dtype={dtype}",
            "--qformat=fp8",
            "--kv_cache_dtype=fp8",
            f"--output_dir={ckpt_dir}",
        ]
        venv_check_call(llm_venv, quantize_cmd)
    else:
        print(f"Converting checkpoint...")
        ckpt_dir = convert_weights(llm_venv=llm_venv,
                                   example_root=gpt_example_root,
                                   cmodel_dir=cmodel_dir,
                                   model="gpt2-minitron",
                                   model_path=minitron_model_root,
                                   data_type=dtype,
                                   gpus=1,
                                   tp_size=1)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={1}",
        f"--max_input_len={1024}",
        f"--max_seq_len={1024}",
        f"--gpt_attention_plugin={dtype}",
        "--context_fmha=enable",
        f"--gemm_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run Minitron...')
    venv_mpi_check_call(
        llm_venv,
        parse_mpi_cmd(
            ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "1"]), [
                f"{gpt_example_root}/../../../summarize.py", "--batch_size=1",
                f"--engine_dir={engine_dir}", "--test_trt_llm",
                "--check_accuracy", "--eval_task", "code_completion",
                "--hf_model_dir", minitron_model_root,
                "--no_add_special_tokens", "--max_attention_window_size=4096",
                "--tensorrt_llm_rouge1_threshold=29",
                f"--dataset_dir={llm_datasets_root}",
                f"--rouge_dir={llm_rouge_root}"
            ])


@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("embedding_sharding_dim", [0, 1])
@pytest.mark.parametrize("dtype", ["float16"])
def test_llm_gpt2_parallel_embedding_2gpu(gpt_example_root, llm_venv,
                                          llm_gpt2_model_root,
                                          llm_datasets_root, llm_rouge_root,
                                          cmodel_dir, engine_dir,
                                          embedding_sharding_dim, dtype):
    "GPT2 with parallel embedding"
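    # embedding_sharding_dim selects how the embedding table is split across
    # ranks: 0 shards along the vocabulary dimension, 1 along the hidden
    # dimension.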
print("Converting checkpoint...")
|
||
ckpt_dir = convert_weights(llm_venv=llm_venv,
|
||
example_root=gpt_example_root,
|
||
cmodel_dir=cmodel_dir,
|
||
model="gpt2",
|
||
model_path=llm_gpt2_model_root,
|
||
data_type=dtype,
|
||
gpus=2,
|
||
tp_size=2,
|
||
use_parallel_embedding=True,
|
||
embedding_sharding_dim=embedding_sharding_dim)
|
||
|
||
print("Building engines...")
|
||
build_cmd = [
|
||
"trtllm-build",
|
||
f"--checkpoint_dir={ckpt_dir}",
|
||
f"--output_dir={engine_dir}",
|
||
f"--max_batch_size={8}",
|
||
f"--max_input_len={1000}",
|
||
f"--max_seq_len={1024}",
|
||
f"--gpt_attention_plugin={dtype}",
|
||
"--workers=2",
|
||
]
|
||
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
|
||
|
||
print("Running inference...")
|
||
venv_mpi_check_call(llm_venv, [
|
||
"mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "2"
|
||
], [
|
||
f"{gpt_example_root}/../../../summarize.py", "--batch_size=8",
|
||
"--test_trt_llm", "--check_accuracy",
|
||
"--tensorrt_llm_rouge1_threshold=13.5", f"--engine_dir={engine_dir}",
|
||
f"--hf_model_dir={llm_gpt2_model_root}", "--no_add_special_tokens",
|
||
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
|
||
])
|
||
|
||
|
||
@pytest.mark.parametrize("llm_gpt2b_lora_model_root",
|
||
[("gpt2b_lora-900.nemo", "gpt2b_lora-stories.nemo")],
|
||
ids=["900_stories"],
|
||
indirect=True)
|
||
def test_llm_gpt2_multi_lora_1gpu(gpt_example_root, llm_venv,
|
||
llm_gpt2_next_model_root, cmodel_dir,
|
||
engine_dir, llm_gpt2b_lora_model_root):
|
||
"gpt2 run lora with nemo checkpoint on 1 gpu"
|
||
print("Converting checkpoint...")
|
||
dtype = "float16"
|
||
ckpt_dir = convert_weights(llm_venv=llm_venv,
|
||
example_root=gpt_example_root,
|
||
cmodel_dir=cmodel_dir,
|
||
model="gpt2-next-lora",
|
||
model_path=llm_gpt2_next_model_root,
|
||
data_type=dtype)
|
||
|
||
print("Building engines...")
|
||
lora_900, lora_stories = llm_gpt2b_lora_model_root.split(",")
|
||
build_cmd = [
|
||
"trtllm-build",
|
||
f"--checkpoint_dir={ckpt_dir}",
|
||
f"--output_dir={engine_dir}",
|
||
f"--max_batch_size={4}",
|
||
f"--max_input_len={512}",
|
||
f"--max_seq_len={562}",
|
||
f"--max_beam_width={2}",
|
||
f"--gpt_attention_plugin={dtype}",
|
||
"--remove_input_padding=enable",
|
||
"--paged_kv_cache=enable",
|
||
"--context_fmha=enable",
|
||
f"--lora_plugin={dtype}",
|
||
"--lora_dir",
|
||
lora_900,
|
||
lora_stories,
|
||
"--lora_ckpt_source=nemo",
|
||
"--lora_target_modules",
|
||
"attn_qkv",
|
||
]
|
||
|
||
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
|
||
|
||
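    # Each --lora_task_uids entry selects the adapter for the matching
    # --input_text entry: 0 -> gpt2b_lora-900, 1 -> gpt2b_lora-stories, and
    # -1 -> run the base model with no LoRA applied.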
    run_cmd = [
        f"{gpt_example_root}/../../../run.py",
        "--max_output_len=20",
        "--use_py_session",
        f"--vocab_file={ckpt_dir}/tokenizer.model",
        f"--engine_dir={engine_dir}",
        "--lora_task_uids",
        "0",
        "-1",
        "1",
        "--no_add_special_tokens",
        "--input_text",
        INPUT_TEXT_1,
        INPUT_TEXT_2,
        INPUT_TEXT_2,
    ]

    output = venv_check_output(llm_venv, run_cmd)
    output = parse_output(output)
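    # Reference continuations: index 0 is INPUT_TEXT_1 with the "900" adapter,
    # index 1 is INPUT_TEXT_2 on the base model, index 2 is INPUT_TEXT_2 with
    # the "stories" adapter; each output only needs to match one candidate.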
    expected_output = [
        [
            "He surprised the Canadians on May 28 in what became known as the Battle of Jumonville",
            "Washington, with Tanaghrisson and his party, surprised the Canadians on May 28 in"
        ],
        [
            "The game is played with a deck of cards, and the player who has the most"
        ],
        [
            "You are a wizard who is a wizard. You are a wizard who is",
            'The job title is "Spellcaster" and the job description is "Spell"'
        ],
    ]

    for idx, result in enumerate(output):
        assert any([similar(item, result)
                    for item in expected_output[idx]]), f"output is {output}"


@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("data_type", ['float16', 'fp8'],
                         ids=['base_fp16', 'base_fp8'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root", ['starcoder2'],
                         indirect=True)
@pytest.mark.parametrize("llm_lora_model_root",
                         ['peft-lora-starcoder2-15b-unity-copilot'],
                         indirect=True)
def test_llm_gpt_starcoder_lora_1gpu(data_type, lora_data_type,
                                     gpt_example_root,
                                     llm_gpt2_starcoder_model_root,
                                     llm_datasets_root, llm_venv, cmodel_dir,
                                     engine_dir, llm_lora_model_root,
                                     qcache_dir):
    "run starcoder2 lora test on 1gpu"
    if data_type == 'fp8':
        skip_fp8_pre_ada(use_fp8=True)
    else:
        if get_device_memory() < 80000:
            pytest.skip("GPU memory is not sufficient.")

    print("Converting checkpoint...")
    model_name = 'starcoder2-lora'

    if data_type == 'fp8':
        model_dir = quantize_data(
            llm_venv,
            gpt_example_root,
            model_dir=llm_gpt2_starcoder_model_root,
            calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
            dtype="float16",
            qformat="fp8",
            kv_cache_dtype="fp8",
            quantize_dir=qcache_dir,
            calib_size=512)
    else:
        model_dir = convert_weights(llm_venv=llm_venv,
                                    example_root=gpt_example_root,
                                    cmodel_dir=cmodel_dir,
                                    model=model_name,
                                    model_path=llm_gpt2_starcoder_model_root)

    print("Build engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={model_dir}",
        f"--output_dir={engine_dir}",
        "--lora_plugin=auto",
        "--gemm_plugin=auto",
        f"--lora_dir={llm_lora_model_root}",
    ]
    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

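    # Golden token ids for "def print_hello_world():": ref_1 is the expected
    # completion with the LoRA adapter applied, ref_2 with the adapter disabled.
    # The exact-match asserts below only bind for the float16 base model.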
ref_1 = [
|
||
610, 1489, 100, 7670, 100, 5879, 2284, 303, 1489, 459, 8302, 10914,
|
||
16013, 222, 222, 610, 1489, 100, 7670, 100, 5879, 100, 115, 100, 5598,
|
||
45, 115
|
||
]
|
||
ref_2 = [
|
||
610, 1489, 100, 7670, 100, 5879, 2284, 303, 1489, 459, 8302, 10914, 678,
|
||
222, 222, 610, 1489, 100, 7670, 100, 5879, 100, 115, 100, 5598, 45, 115
|
||
]
|
||
|
||
input_text = "def print_hello_world():"
|
||
|
||
print(f"Run inference with lora id 0...")
|
||
venv_check_call(llm_venv, [
|
||
f"{gpt_example_root}/../../../run.py",
|
||
"--max_output_len=20",
|
||
f"--input_text={input_text}",
|
||
"--lora_task_uids=0",
|
||
f"--tokenizer_dir={llm_gpt2_starcoder_model_root}",
|
||
f"--engine_dir={engine_dir}",
|
||
f"--output_csv={llm_venv.get_working_directory()}/use_lora.csv",
|
||
"--no_add_special_tokens",
|
||
"--use_py_session",
|
||
])
|
||
|
||
with open(f"{llm_venv.get_working_directory()}/use_lora.csv") as f:
|
||
predict = csv.reader(f)
|
||
predict = next(predict)
|
||
predict = [int(p) for p in predict]
|
||
assert ref_1 == predict or data_type != "float16"
|
||
|
||
print(f"Run inference with lora id -1...")
|
||
venv_check_call(llm_venv, [
|
||
f"{gpt_example_root}/../../../run.py",
|
||
"--max_output_len=20",
|
||
f"--input_text={input_text}",
|
||
"--lora_task_uids=-1",
|
||
f"--tokenizer_dir={llm_gpt2_starcoder_model_root}",
|
||
f"--engine_dir={engine_dir}",
|
||
f"--output_csv={llm_venv.get_working_directory()}/no_lora.csv",
|
||
"--no_add_special_tokens",
|
||
"--use_py_session",
|
||
])
|
||
|
||
with open(f"{llm_venv.get_working_directory()}/no_lora.csv") as f:
|
||
predict = csv.reader(f)
|
||
predict = next(predict)
|
||
predict = [int(p) for p in predict]
|
||
assert ref_2 == predict or data_type != "float16"
|
||
|
||
|
||
@pytest.mark.parametrize("llm_gpt2_starcoder_model_root", ['starcoder2'],
|
||
indirect=True)
|
||
def test_llm_starcoder2_sqootb_single_gpu(gpt_example_root, llm_venv,
|
||
llm_gpt2_starcoder_model_root,
|
||
llm_datasets_root, llm_rouge_root,
|
||
cmodel_dir, engine_dir):
|
||
"Starcoder2-smooth test on single gpu"
|
||
print("Quantization...")
|
||
dtype = 'float16'
|
||
ckpt_dir = f"{cmodel_dir}/starcoder2/int8_sq/1-gpu"
|
||
|
||
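    # INT8 SmoothQuant post-training quantization with default settings,
    # calibrated on cnn_dailymail samples ("sqootb" in the test name presumably
    # stands for SmoothQuant out-of-the-box).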
    quantize_cmd = [
        f"{gpt_example_root}/../../../quantization/quantize.py",
        f"--model_dir={llm_gpt2_starcoder_model_root}",
        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
        f"--dtype={dtype}",
        "--qformat=int8_sq",
        f"--output_dir={ckpt_dir}",
    ]
    venv_check_call(llm_venv, quantize_cmd)

    print("Building engines...")
    build_cmd = [
        "trtllm-build",
        f"--checkpoint_dir={ckpt_dir}",
        f"--output_dir={engine_dir}",
        f"--max_batch_size={8}",
        f"--max_seq_len={4096}",
        f"--gpt_attention_plugin={dtype}",
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print('Run starcoder2...')
    venv_mpi_check_call(
        llm_venv,
        parse_mpi_cmd(
            ["mpirun", "--allow-run-as-root", "--oversubscribe", "-np", "1"]), [
                f"{gpt_example_root}/../../../summarize.py", "--batch_size=1",
                f"--engine_dir={engine_dir}", "--test_trt_llm",
                "--check_accuracy", "--eval_task", "code_completion",
                f"--hf_model_dir={llm_gpt2_starcoder_model_root}",
                "--no_add_special_tokens", "--max_attention_window_size=4096",
                "--tensorrt_llm_rouge1_threshold=25",
                f"--dataset_dir={llm_datasets_root}",
                f"--rouge_dir={llm_rouge_root}"
            ])


@skip_pre_ada
@pytest.mark.parametrize("minitron_model_root", ["4b"], indirect=True)
def test_llm_minitron_fp8_with_pseudo_loras(gpt_example_root,
                                            minitron_model_root,
                                            llm_datasets_root,
                                            llm_venv,
                                            cmodel_dir,
                                            engine_dir,
                                            dtype='bfloat16'):
    "Run Minitron model with multiple pseudo LoRAs."

    # Quantize the base model to fp8.
    print("Quantizing and converting checkpoint...")
    ckpt_dir = f"{cmodel_dir}/minitron/fp8/1-gpu"

    quantize_cmd = [
        f"{gpt_example_root}/../../../quantization/quantize.py",
        f"--model_dir={minitron_model_root}",
        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
        f"--dtype={dtype}",
        "--qformat=fp8",
        "--kv_cache_dtype=fp8",
        f"--output_dir={ckpt_dir}",
    ]
    venv_check_call(llm_venv, quantize_cmd)

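    # test_multi_lora_support builds a LoRA-enabled engine and attaches two
    # generated rank-8 adapters on the attention q/k/v projections; with
    # zero_lora_weights=True the adapters are zero-initialized, so they should
    # leave the FP8 base model's outputs unchanged.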
    test_multi_lora_support(
        hf_model_dir=minitron_model_root,
        tllm_ckpt_dir=ckpt_dir,
        engine_dir=engine_dir,
        llm_venv=llm_venv,
        example_root=gpt_example_root,
        num_loras=2,
        lora_rank=8,
        target_hf_modules=["q_proj", "k_proj", "v_proj"],
        target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
        zero_lora_weights=True,
    )


@pytest.mark.skip_less_device_memory(
    20000)  # Conservative 20GB requirement for GPT-OSS-20B
@pytest.mark.parametrize("gpt_oss_model_root", [
    "gpt-oss-20b",
], indirect=True)
@pytest.mark.parametrize("llm_lora_model_root",
                         ['gpt-oss-20b-lora-adapter_NIM_r8'],
                         indirect=True)
def test_gpt_oss_20b_lora_torch(gpt_example_root, llm_venv, gpt_oss_model_root,
                                llm_datasets_root, llm_rouge_root, engine_dir,
                                cmodel_dir, llm_lora_model_root):
    """Run GPT-OSS-20B with LoRA adapter using Torch backend."""

    print(f"Using LoRA from: {llm_lora_model_root}")

    defs.ci_profiler.start("test_gpt_oss_20b_lora_torch")

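    # Unlike the tests above, this one drives the PyTorch backend directly
    # through the LLM API instead of the trtllm-build flow. LoraConfig caps the
    # adapter pool at a single rank-8 adapter (one GPU slot, one CPU slot).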
    lora_config = LoraConfig(
        lora_dir=[llm_lora_model_root],
        max_lora_rank=8,  # Match adapter_config.json "r": 8
        max_loras=1,
        max_cpu_loras=1,
    )

    with LLM(model=gpt_oss_model_root, lora_config=lora_config) as llm:

        prompts = [
            "User: Message Mason saying that we should compete in next week's football tournament, and tell him that the winner will get $100.\n\nAssistant: "
        ]

        sampling_params = SamplingParams(max_tokens=50)

        lora_request = [LoRARequest("gpt-oss-lora", 0, llm_lora_model_root)]

        print("Running inference with real LoRA adapter...")
        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_request)

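        # Reference completion for the prompt above; the assertion below uses a
        # fuzzy similarity check (threshold 0.8) rather than an exact match.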
expected_output = " Hey Mason! I hope you're doing well. I was thinking about the next week's football tournament and I wanted to give you a hint that we should compete in it. The winner will be a great opportunity for us to win $100.\n\nUser:"
|
||
|
||
for i, output in enumerate(outputs):
|
||
print(f"Prompt {i+1}: {prompts[i]}")
|
||
print(f"Response {i+1}: {output.outputs[0].text}")
|
||
print("-" * 50)
|
||
|
||
assert len(outputs) == 1
|
||
assert len(outputs[0].outputs) > 0
|
||
generated_text = outputs[0].outputs[0].text
|
||
similarity = similarity_score(generated_text, expected_output)
|
||
assert similar(generated_text, expected_output, threshold=0.8), \
|
||
f"Output similarity too low (similarity={similarity:.2%})!\nExpected: {repr(expected_output)}\nGot: {repr(generated_text)}"
|
||
|
||
defs.ci_profiler.stop("test_gpt_oss_20b_lora_torch")
|
||
print(
|
||
f"test_gpt_oss_20b_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_gpt_oss_20b_lora_torch')} sec"
|
||
)
|