import os
from pathlib import Path

import defs.ci_profiler
import pytest
from defs.common import (convert_weights, test_llm_torch_multi_lora_support,
                         venv_check_call, venv_mpi_check_call)
from defs.conftest import get_device_memory, get_sm_version
from defs.trt_test_alternative import check_call

from tensorrt_llm import LLM
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.sampling_params import SamplingParams

# Skip TRT-flow test cases on post-Blackwell-Ultra architectures.
if get_sm_version() >= 103:
    pytest.skip(
        "TRT workflow tests are not supported on post Blackwell-Ultra architecture",
        allow_module_level=True)
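
# Minimum ROUGE-1 score that summarize.py must reach (via --check_accuracy)
# for the TRT-LLM engine output to pass.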
ROUGE1_ACCURACY_THRESHOLD = 20


@pytest.mark.parametrize("nemotron_nas_model_root", [
    "DeciLM-7B",
],
                         indirect=True)
def test_nemotron_nas_summary_1gpu(nemotron_nas_example_root, llm_venv,
                                   nemotron_nas_model_root, llm_datasets_root,
                                   llm_rouge_root, engine_dir, cmodel_dir):
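    """Convert the HF checkpoint, build a single-GPU TRT engine, and check
    summarization accuracy with summarize.py."""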
    model_name = Path(nemotron_nas_model_root).name
    if "51B" in model_name and get_device_memory() < 80000:
        pytest.skip("device memory is insufficient.")

    print(f"Model name: {model_name}")
    dtype = 'float16'
    ckpt_type = "hf"

    print("Converting checkpoint...")
    model_dir = convert_weights(llm_venv=llm_venv,
                                example_root=nemotron_nas_example_root,
                                cmodel_dir=cmodel_dir,
                                model=model_name,
                                model_path=nemotron_nas_model_root,
                                data_type=dtype,
                                ckpt_type=ckpt_type,
                                gpus=1,
                                tp_size=1,
                                trust_remote_code=True)
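
    # Build the TRT engine from the converted checkpoint; paged KV cache and
    # auto-selected plugins follow the settings used across these example tests.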
    print("Building engines...")
    build_cmd = [
        "trtllm-build", f"--checkpoint_dir={model_dir}",
        f"--output_dir={engine_dir}", f"--max_batch_size={4}",
        f"--max_input_len={2048}", "--kv_cache_type=paged",
        "--remove_input_padding=enable", "--gemm_plugin=auto",
        "--gpt_attention_plugin=auto"
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")
    summary_cmd = [
        f"{nemotron_nas_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}", "--test_hf", "--hf_device_map_auto",
        "--batch_size=1", "--test_trt_llm",
        f"--hf_model_dir={nemotron_nas_model_root}", "--check_accuracy",
        f"--tensorrt_llm_rouge1_threshold={ROUGE1_ACCURACY_THRESHOLD}",
        "--no_add_special_tokens", f"--dataset_dir={llm_datasets_root}",
        f"--rouge_dir={llm_rouge_root}"
    ]

    venv_check_call(llm_venv, summary_cmd)


@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("nemotron_nas_model_root", [
    "DeciLM-7B",
    "Llama-3_1-Nemotron-51B-Instruct",
],
                         indirect=True)
def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,
                                   nemotron_nas_model_root, llm_datasets_root,
                                   llm_rouge_root, engine_dir, cmodel_dir):
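    """Same flow as the 1-GPU test, but converted and built with TP=2; the
    summarization step runs under mpirun with two ranks."""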
    model_name = Path(nemotron_nas_model_root).name
    if "51B" in model_name and get_device_memory() < 80000:
        pytest.skip("device memory is insufficient.")

    print(f"Model name: {model_name}")
    dtype = 'float16'
    ckpt_type = "hf"

    print("Converting checkpoint...")
    model_dir = convert_weights(llm_venv=llm_venv,
                                example_root=nemotron_nas_example_root,
                                cmodel_dir=cmodel_dir,
                                model=model_name,
                                model_path=nemotron_nas_model_root,
                                data_type=dtype,
                                ckpt_type=ckpt_type,
                                gpus=2,
                                tp_size=2,
                                trust_remote_code=True)

    print("Building engines...")
    build_cmd = [
        "trtllm-build", f"--checkpoint_dir={model_dir}",
        f"--output_dir={engine_dir}", f"--max_batch_size={4}",
        f"--max_input_len={2048}", "--kv_cache_type=paged",
        "--remove_input_padding=enable", "--gemm_plugin=auto",
        "--gpt_attention_plugin=auto"
    ]

    check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

    print("Running inference...")
    mpi_cmd = ["mpirun", "-n", "2", "--allow-run-as-root"]

    summary_cmd = [
        f"{nemotron_nas_example_root}/../../../summarize.py",
        f"--engine_dir={engine_dir}", "--test_hf", "--hf_device_map_auto",
        "--batch_size=1", "--test_trt_llm",
        f"--hf_model_dir={nemotron_nas_model_root}", "--check_accuracy",
        f"--tensorrt_llm_rouge1_threshold={ROUGE1_ACCURACY_THRESHOLD}",
        "--no_add_special_tokens", f"--dataset_dir={llm_datasets_root}",
        f"--rouge_dir={llm_rouge_root}"
    ]

    venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)


@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
    "Llama-3.1-Nemotron-Nano-8B-v1",
],
                         indirect=True)
def test_nemotron_nano_8b_lora_torch(nemotron_nas_example_root, llm_venv,
                                     nemotron_nas_model_root, llm_datasets_root,
                                     llm_rouge_root, engine_dir, cmodel_dir):
    """Run Nemotron Nano 8B with multiple dummy LoRAs using the LLM-API Torch backend."""

    expected_outputs = {
        'llama-3.1-nemotron-nano-8b-v1': [
            " I am having a bit of a problem with my computer. The screen is black, but my monitor is still giving me the same signals. The brightness",
            " How is the climate like? What are some of the typical foods and drinks of the region? What is the economy like? How does the city compare",
            " I have heard that it's possible but can be dangerous. What are the potential risks? Are there any safety guidelines? I should probably check some references",
            " I can't do that right now. But I can suggest that if you're interested in music trends, you can check Spotify's \"Discover Weekly\"",
            " The capital of France is Paris. But wait, I think there's another city called Paris. No, no, that's the same city. Maybe"
        ],
    }

    print("Testing with LLM-API Torch backend...")

    defs.ci_profiler.start("test_llm_torch_multi_lora_support")

    model_name = os.path.basename(nemotron_nas_model_root).lower()
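    # zero_lora_weights=True zero-initializes the dummy adapters, so each LoRA
    # should act as a no-op and reproduce the base model's outputs above.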
    test_llm_torch_multi_lora_support(
        hf_model_dir=nemotron_nas_model_root,
        llm_venv=llm_venv,
        num_loras=2,
        lora_rank=8,
        target_hf_modules=["q_proj", "k_proj", "v_proj"],
        zero_lora_weights=True,
        tensor_parallel_size=1,
        expected_outputs=expected_outputs[model_name])
    defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
    print(
        f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
    )


@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
    "Llama-3_3-Nemotron-Super-49B-v1",
],
                         indirect=True)
@pytest.mark.parametrize(
    "llm_lora_model_root",
    ['Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_NIM_r32'],
    indirect=True)
def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
                                            nemotron_nas_model_root,
                                            llm_lora_model_root,
                                            llm_datasets_root, llm_rouge_root,
                                            engine_dir, cmodel_dir):
    """Run Nemotron Super 49B with a real LoRA adapter using the LLM-API Torch backend."""

    print("Testing Nemotron Super 49B with real LoRA adapters...")

    print(f"Using real LoRA from: {llm_lora_model_root}")

    defs.ci_profiler.start("test_nemotron_real_lora_torch")

    lora_config = LoraConfig(
        lora_dir=[llm_lora_model_root],
        max_lora_rank=32,  # From adapter_config.json: "r": 32
        max_loras=1,
        max_cpu_loras=1,
    )
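
    # load_format="dummy" initializes base-model weights randomly instead of
    # loading the full checkpoint, keeping the test light; the assertions below
    # exercise the LoRA request plumbing rather than output quality.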
    with LLM(model=nemotron_nas_model_root,
             lora_config=lora_config,
             tensor_parallel_size=4,
             dtype="bfloat16",
             max_batch_size=2,
             max_input_len=512,
             max_seq_len=1024,
             max_beam_width=1,
             load_format="dummy") as llm:

        prompts = [
            "What is the capital of France?",
            "Explain quantum computing in simple terms."
        ]

        sampling_params = SamplingParams(max_tokens=50,
                                         temperature=0.7,
                                         top_p=0.9)
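
        # One LoRA request per prompt; both reference the same adapter
        # directory but carry distinct adapter IDs (0 and 1).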
        lora_request = [
            LoRARequest("nemotron-lora", 0, llm_lora_model_root),
            LoRARequest("nemotron-lora", 1, llm_lora_model_root)
        ]

        print("Running inference with real LoRA adapter...")
        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_request)

        for i, output in enumerate(outputs):
            print(f"Prompt {i+1}: {prompts[i]}")
            print(f"Response {i+1}: {output.outputs[0].text}")
            print("-" * 50)

        assert len(outputs) == 2
        assert len(outputs[0].outputs) > 0
        assert len(outputs[1].outputs) > 0
        assert len(outputs[0].outputs[0].text) > 0
        assert len(outputs[1].outputs[0].text) > 0

    defs.ci_profiler.stop("test_nemotron_real_lora_torch")
    print(
        f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
    )


@pytest.mark.skip(reason="TODO: Test OOMs on 8 GPUs - to fix")
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
    "Llama-3_1-Nemotron-Ultra-253B-v1",
],
                         indirect=True)
def test_nemotron_ultra_253b_lora_torch(nemotron_nas_example_root, llm_venv,
                                        nemotron_nas_model_root,
                                        llm_datasets_root, llm_rouge_root,
                                        engine_dir, cmodel_dir):
    """Run Nemotron Ultra 253B with multiple dummy LoRAs using the LLM-API Torch backend."""

    expected_outputs = {
        'llama-3_1-nemotron-ultra-253b-v1': ["...", "...", "...", "...", "..."],
    }
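
    # The "..." entries above are placeholders; this test remains skipped until
    # the 8-GPU OOM is fixed.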

    print("Testing with LLM-API Torch backend...")

    defs.ci_profiler.start("test_llm_torch_multi_lora_support")
    model_name = os.path.basename(nemotron_nas_model_root).lower()
    test_llm_torch_multi_lora_support(
        hf_model_dir=nemotron_nas_model_root,
        llm_venv=llm_venv,
        num_loras=2,
        lora_rank=8,
        target_hf_modules=["q_proj", "k_proj", "v_proj"],
        zero_lora_weights=True,
        tensor_parallel_size=8,
        expected_outputs=expected_outputs[model_name])
    defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
    print(
        f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
    )