# TensorRT-LLM: tests/hlapi/test_executor.py

import os
import sys
import unittest
from pathlib import Path
from typing import Optional

import pytest
import torch
from mpi4py import MPI
from transformers import AutoTokenizer

from tensorrt_llm.executor import (GenerationExecutor, GenerationExecutorWorker,
                                   GenerationRequest, SamplingConfig)
from tensorrt_llm.hlapi.llm import LLM, ModelConfig

WORLD_SIZE = MPI.COMM_WORLD.size

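# Path fixtures: locate the repository root and the model root (both
# overridable via the LLM_ROOT and LLM_MODEL_ROOT environment variables) and
# derive the engine output directory under cpp/tests/resources/models/rt_engine.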
@pytest.fixture(scope="module")
def llm_root() -> Path:
    environ_root = os.environ.get("LLM_ROOT", None)
    return Path(environ_root) if environ_root is not None else Path(
        __file__).parent.parent.parent


@pytest.fixture(scope="module")
def llm_model_root() -> Optional[Path]:
    if "LLM_MODEL_ROOT" in os.environ:
        return Path(os.environ["LLM_MODEL_ROOT"])

    sys.path.append(str(Path(__file__).resolve().parent.parent))
    from utils.llm_data import llm_models_root
    return llm_models_root()


@pytest.fixture(scope="module")
def resource_path(llm_root: Path) -> Path:
    return llm_root / "cpp" / "tests" / "resources"


@pytest.fixture(scope="module")
def engine_path(resource_path: Path) -> Path:
    return resource_path / "models" / "rt_engine"


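# Engine-building fixtures: the first time a test needs a LLaMA-7B engine, it is
# built with the high-level LLM API and saved to disk; later runs reuse the
# serialized engine instead of rebuilding it.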
@pytest.fixture(scope="module")
def llama_7b_path(engine_path: Path, llm_model_root: Path) -> Path:
    path = engine_path / "llama7b"
    if not path.exists():
        config = ModelConfig(str(llm_model_root / "llama-models/llama-7b-hf"))
        llm = LLM(config)
        llm.save(str(path))
    return path


@pytest.fixture(scope="module")
def llama_7b_bs2_path(engine_path: Path, llm_model_root: Path) -> Path:
    path = engine_path / "llama7b_bs2"
    if not path.exists():
        config = ModelConfig(str(llm_model_root / "llama-models/llama-7b-hf"),
                             max_beam_width=2)
        llm = LLM(config)
        llm.save(str(path))
    return path


@pytest.fixture(scope="module")
def llama_7b_tp2_path(engine_path: Path, llm_model_root: Path) -> Path:
    path = engine_path / "llama7b-tp2"
    if not path.exists():
        config = ModelConfig(str(llm_model_root / "llama-models/llama-7b-hf"))
        config.parallel_config.tp_size = 2
        llm = LLM(config)
        llm.save(str(path))
    return path


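# Beam search: with beam_width=2 a single request returns two beams, which for
# this prompt differ only in the last generated token.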
@pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
def test_generation_bs2(llama_7b_bs2_path: Path):
    tokenizer = llama_7b_bs2_path
    prompt = "A B C D"
    max_new_tokens = 4

    with GenerationExecutorWorker(llama_7b_bs2_path,
                                  tokenizer,
                                  max_beam_width=2) as executor:
        result = executor.generate(prompt,
                                   sampling_config=SamplingConfig(
                                       max_new_tokens=max_new_tokens,
                                       beam_width=2))
        assert result.text[0] == "<s> A B C D E F G H"
        assert result.text[1] == "<s> A B C D E F G I"


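# Single-rank coverage of the executor front ends: synchronous generate() for
# single and batched prompts, asynchronous streaming via generate_async(), and
# the low-level submit()/wait_first_completed() interface.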
@pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
def test_sync_generation(llama_7b_path: Path):
    tokenizer = llama_7b_path
    prompt = "A B C D"
    expected_output = " E F G H"
    expected_long_output = " E F G H I J K L"
    split_output = ["E", " F", " G", " H", " I", " J", " K", " L"]
    sampling_config0 = SamplingConfig(max_new_tokens=4)
    sampling_config1 = SamplingConfig(max_new_tokens=8)

    with GenerationExecutorWorker(llama_7b_path, tokenizer) as executor:
        # Simple generations (synchronous)
        result = executor.generate(prompt, sampling_config=sampling_config0)
        assert result.text == "<s> " + prompt + expected_output

        results = executor.generate(
            [prompt, prompt],
            sampling_config=[sampling_config0, sampling_config1])
        for result, expected in zip(results,
                                    (expected_output, expected_long_output)):
            assert result.text == "<s> " + prompt + expected

        # Simple generations (asynchronous)
        #
        # Iterate the partial results when streaming
        future = executor.generate_async(prompt,
                                         streaming=True,
                                         sampling_config=sampling_config0)
        for idx, partial_result in enumerate(future):
            assert partial_result.text_diff == split_output[idx]

        # Streaming results in a nested loop over a batch of futures
        futures = executor.generate_async(
            [prompt, prompt],
            streaming=True,
            sampling_config=[sampling_config0, sampling_config1])
        for future in futures:
            for idx, partial_result in enumerate(future):
                assert partial_result.text_diff == split_output[idx]

        # Low-level API with .submit
        # Submit a batch of requests
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        futures = []
        for _ in range(5):
            futures.append(
                executor.submit(
                    GenerationRequest(
                        prompt,
                        tokenizer=AutoTokenizer.from_pretrained(llama_7b_path),
                        sampling_config=sampling_config0)))

        for future in executor.wait_first_completed(futures):
            assert future.done
            assert future.result().text == "".join(split_output[:4])


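# Tensor-parallel tests: the TP-2 engine needs at least two GPUs. The next two
# variants are launched with two MPI ranks, one rank per GPU.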
@pytest.mark.skipif(torch.cuda.device_count() < 2 or WORLD_SIZE != 2,
                    reason="Must run on 2 MPI ranks with at least 2 GPUs")
def test_sync_generation_tp_all_nodes(llama_7b_tp2_path: Path):
    prompt = "deep learning"
    sampling_config = SamplingConfig(max_new_tokens=4)

    # Normal execution, all nodes live
    executor = GenerationExecutorWorker(llama_7b_tp2_path, llama_7b_tp2_path)
    result = executor.generate(prompt, sampling_config=sampling_config)
    assert result.text == "<s> deep learning, neural network,"
    executor.shutdown()


@pytest.mark.skipif(torch.cuda.device_count() < 2 or WORLD_SIZE != 2,
                    reason="Must run on 2 MPI ranks with at least 2 GPUs")
def test_sync_generation_tp_main_node_only(llama_7b_tp2_path: Path):
    prompt = "deep learning"
    sampling_config = SamplingConfig(max_new_tokens=4)

    with GenerationExecutorWorker(llama_7b_tp2_path,
                                  llama_7b_tp2_path) as executor:
        executor.block_subordinates()
        # From now on, only rank 0 lives inside the with statement;
        # the other ranks wait at the end of the with statement.
        result = executor.generate(prompt, sampling_config=sampling_config)
        assert result.text == "<s> deep learning, neural network,"


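# Same TP-2 engine, but driven from a single MPI rank: GenerationExecutor.create
# is called with model_world_size=2, so the executor is expected to bring up the
# extra tensor-parallel worker itself.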
@pytest.mark.skipif(torch.cuda.device_count() < 2 or WORLD_SIZE != 1,
                    reason="Must run on 1 MPI rank with at least 2 GPUs")
def test_sync_generation_tp_inner(llama_7b_tp2_path: Path):
    prompt = "deep learning"
    tp_size = 2
    sampling_config = SamplingConfig(max_new_tokens=4)

    executor = GenerationExecutor.create(llama_7b_tp2_path,
                                         llama_7b_tp2_path,
                                         model_world_size=tp_size)
    result = executor.generate(prompt, sampling_config=sampling_config)
    assert result.text == "<s> deep learning, neural network,"
    executor.shutdown()


if __name__ == "__main__":
    unittest.main()