import json as _json
import os
import pathlib as _pl
import sys
import time as _time
import typing as _tp

import numpy as _np
import pytest
import torch as _tor
from binding_test_utils import *
from transformers import AutoTokenizer

import tensorrt_llm.bindings as _tb

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import getSMVersion


@pytest.mark.parametrize("variant, results_file", [
    ("fp16-plugin-packed-paged",
     "output_tokens_fp16_plugin_packed_paged_tp1_pp1.npy"),
])
def test_gpt_manager(variant, results_file, llm_root: _pl.Path,
                     resource_path: _pl.Path, engine_path: _pl.Path,
                     data_path: _pl.Path, llm_model_root):
    if getSMVersion() < 80:
        pytest.skip(
            "ContextFMHAType with fp32 acc is not supported on pre-Ampere architectures"
        )

    model_dir = "gpt2"
    tp_size = 1
    pp_size = 1
    beam_width = 1
    max_batch_size = 8
    end_id = 50256
    pad_id = 50256

    # Load input data.
    input_path = data_path / "input_tokens.npy"
    assert input_path.is_file()
    given_input = _np.load(input_path).astype("int32")
    input_shape = given_input.shape
    assert len(input_shape) == 2
    num_given_inputs = input_shape[0]
    assert max_batch_size <= num_given_inputs
    max_input_length = input_shape[1]
    given_input_lengths = sequence_lengths(given_input, pad_id)
    assert _np.all(given_input_lengths <= max_input_length)

    # Load expected output data, generating it first if it is missing.
    results_path = data_path / model_dir / (
        "sampling" if beam_width == 1 else
        f"beam_search_{beam_width}") / results_file
    if not results_path.exists():
        model_cache_arg = (["--model_cache", str(llm_model_root)]
                           if llm_model_root is not None else [])
        prepare_model_tests(llm_root, resource_path, "gpt", model_cache_arg)
    assert results_path.is_file()
    expected_output = _np.load(results_path)
    output_shape = expected_output.shape
    assert len(output_shape) == 2
    assert num_given_inputs * beam_width == output_shape[0]
    max_seq_length = output_shape[1]
    assert max_input_length <= max_seq_length
    expected_output_lengths = sequence_lengths(expected_output, end_id)
    assert _np.all(expected_output_lengths <= max_seq_length)

    gpu_size_path = f"tp{tp_size}-pp{pp_size}-gpu"
    model_path = engine_path / model_dir / variant / gpu_size_path
    assert model_path.is_dir()
    config_path = model_path / "config.json"
    config_json = _tb.GptJsonConfig.parse_file(config_path)
    assert config_json.tensor_parallelism == tp_size
    assert config_json.pipeline_parallelism == pp_size
    world_config = _tb.WorldConfig.mpi(tensor_parallelism=tp_size,
                                       pipeline_parallelism=pp_size)
    engine_filename = config_json.engine_filename(world_config)
    assert (model_path / engine_filename).is_file()

    max_new_tokens = max_seq_length - max_input_length

    inference_request_list = []
    for i, (req, length) in enumerate(zip(given_input, given_input_lengths)):
        ir = _tb.InferenceRequest(i)
        ir.input_ids = _tor.tensor(req[:length].tolist(), dtype=_tor.int32)
        ir.max_new_tokens = _tor.tensor([[length + max_new_tokens]],
                                        dtype=_tor.int32)
        ir.end_id = _tor.tensor([end_id], dtype=_tor.int32)
        ir.pad_id = _tor.tensor([pad_id], dtype=_tor.int32)
        ir.beam_width = _tor.tensor([beam_width], dtype=_tor.int32)
        ir.temperature = _tor.tensor([1.0], dtype=_tor.float32)
        ir.min_length = _tor.tensor([1], dtype=_tor.int32)
        ir.random_seed = _tor.tensor([42], dtype=_tor.int64)
        ir.runtime_top_k = _tor.tensor([0], dtype=_tor.int32)
        ir.runtime_top_p = _tor.tensor([0.0], dtype=_tor.float32)
        inference_request_list.append(ir)
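
    # The post-processor below runs on the runtime's CUDA stream: it sets
    # every logit to -inf except token id 42, so the extra request registered
    # under id 42 can only ever generate token 42 (device_type=1 corresponds
    # to torch's kCUDA device type, per the inline comment below).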
    def logits_post_processor(req_id: int, logits: _tor.Tensor,
                              ids: _tp.List[_tp.List[int]],
                              stream: _tor.Stream):
        del req_id, ids
        cuda_stream = _tor.cuda.Stream(
            stream_id=stream.stream_id,
            device_index=stream.device_index,
            device_type=1,  # == kCUDA
        )
        with _tor.cuda.stream(cuda_stream):
            logits[:] = float("-inf")
            logits[..., 42] = 0
        return logits

    ir = _tb.InferenceRequest(42, logits_post_processor)
    ir.input_ids = _tor.tensor(given_input[0].tolist(), dtype=_tor.int32)
    ir.max_new_tokens = _tor.tensor([[8]], dtype=_tor.int32)
    ir.end_id = _tor.tensor([end_id], dtype=_tor.int32)
    ir.pad_id = _tor.tensor([pad_id], dtype=_tor.int32)
    ir.beam_width = _tor.tensor([beam_width], dtype=_tor.int32)
    ir.temperature = _tor.tensor([1.0], dtype=_tor.float32)
    ir.min_length = _tor.tensor([1], dtype=_tor.int32)
    ir.random_seed = _tor.tensor([42], dtype=_tor.int64)
    ir.runtime_top_k = _tor.tensor([0], dtype=_tor.int32)
    ir.runtime_top_p = _tor.tensor([0.0], dtype=_tor.float32)
    inference_request_list.append(ir)

    def fetch_requests(max_num_sequences: int):
        nonlocal inference_request_list
        fetched = []
        for _ in range(max_num_sequences):
            try:
                fetched.append(inference_request_list.pop())
            except IndexError:
                break
        return fetched

    def response_cb(req_id: int, tensors: _tp.List[_tb.NamedTensor],
                    is_ok: bool, err_msg: str):
        nonlocal remaining_requests
        remaining_requests -= 1
        assert is_ok
        assert not err_msg
        tensor_dict = {item.name: item.tensor for item in tensors}
        batch_idx = req_id
        observed_output = tensor_dict[_tb.tensor_names.OUTPUT_IDS]
        assert observed_output is not None
        if req_id == 42:
            outputs = observed_output[..., len(given_input[0]):]
            assert _tor.allclose(outputs,
                                 _tor.tensor([42], dtype=_tor.int32))
            return
        expected_length = expected_output_lengths[batch_idx]
        observed_length = (
            tensor_dict[_tb.tensor_names.SEQUENCE_LENGTH].item() -
            given_input_lengths[batch_idx])
        assert expected_length == observed_length, (batch_idx,
                                                    expected_length,
                                                    observed_length)
        expected = expected_output[batch_idx, :expected_length]
        observed = observed_output[0, 0, :expected_length].numpy()
        unmatched = expected != observed
        if _np.any(unmatched):
            assert False, (batch_idx, _np.where(unmatched),
                           _np.column_stack((expected, observed))[unmatched])

    def should_stop():
        return set()

    def stats_cb(stats_json: str):
        assert _json.loads(stats_json)

    opt_params = _tb.TrtGptModelOptionalParams()
    memory_counters = _tb.MemoryCounters.instance()
    init_gpu_mem = memory_counters.gpu

    for _ in range(3):
        remaining_requests = len(inference_request_list)
        with _tb.GptManager(model_path, _tb.TrtGptModelType.InflightBatching,
                            1, _tb.SchedulerPolicy.MAX_UTILIZATION,
                            fetch_requests, response_cb, should_stop,
                            stats_cb, opt_params, 10000) as manager:
            while remaining_requests > 0:
                _time.sleep(0.1)
            assert manager is not None
            assert memory_counters.gpu > init_gpu_mem
        assert memory_counters.gpu == init_gpu_mem
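
# The test below reuses the GptManager flow, but constrains decoding with
# lm-format-enforcer: its logits post-processor masks out every token that
# would violate a JSON schema, so gpt2 can only emit schema-conformant JSON.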
@pytest.mark.parametrize("variant, results_file", [
    ("fp16-plugin-packed-paged",
     "output_tokens_fp16_plugin_packed_paged_tp1_pp1.npy"),
])
def test_gpt_manager_constrained_generation(
        variant, results_file, llm_root: _pl.Path, resource_path: _pl.Path,
        engine_path: _pl.Path, data_path: _pl.Path, llm_model_root):
    try:
        from lmformatenforcer import (JsonSchemaParser, TokenEnforcer,
                                      TokenEnforcerTokenizerData)
        from pydantic import BaseModel
    except ImportError:
        pytest.skip("Cannot import lmformatenforcer, skipping test")

    def _build_regular_tokens_list(
            tokenizer) -> _tp.List[_tp.Tuple[int, str, bool]]:
        token_0 = [tokenizer.encode("0")[-1]]
        regular_tokens = []
        vocab_size = tokenizer.vocab_size
        for token_idx in range(vocab_size):
            if token_idx in tokenizer.all_special_ids:
                continue
            # We prepend token 0 and skip the first letter of the result to
            # get a space if the token is a start word.
            tensor_after_0 = _tor.tensor(token_0 + [token_idx],
                                         dtype=_tor.long)
            decoded_after_0 = tokenizer.decode(tensor_after_0)[1:]
            decoded_regular = tokenizer.decode(token_0)
            is_word_start_token = len(decoded_after_0) > len(decoded_regular)
            regular_tokens.append(
                (token_idx, decoded_after_0, is_word_start_token))
        return regular_tokens

    def build_token_enforcer(tokenizer, character_level_parser):
        """
        Build a logits processor for feeding into the generate function
        (use_py_session should be True).
        """
        regular_tokens = _build_regular_tokens_list(tokenizer)

        def _decode(tokens: _tp.List[int]) -> str:
            tensor = _tor.tensor(tokens, dtype=_tor.long)
            return tokenizer.decode(tensor)

        tokenizer_data = TokenEnforcerTokenizerData(regular_tokens, _decode,
                                                    tokenizer.eos_token_id)
        return TokenEnforcer(tokenizer_data, character_level_parser)

    tp_size = 1
    pp_size = 1
    model_dir = "gpt2"
    gpu_size_path = f"tp{tp_size}-pp{pp_size}-gpu"
    model_path = engine_path / model_dir / variant / gpu_size_path
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    input = "Please give me information about Michael Jordan. You MUST answer using the following json schema: "
    prompt = tokenizer.encode(input)

    class AnswerFormat(BaseModel):
        last_name: str
        year_of_birth: int

    parser = JsonSchemaParser(AnswerFormat.model_json_schema())
    token_enforcer = build_token_enforcer(tokenizer, parser)

    def fetch_requests(max_num_sequences: int):
        nonlocal inference_request_list
        fetched = []
        for _ in range(max_num_sequences):
            try:
                fetched.append(inference_request_list.pop())
            except IndexError:
                break
        return fetched

    def logits_post_processor(req_id: int, logits: _tor.Tensor,
                              ids: _tp.List[_tp.List[int]],
                              stream: _tor.Stream):
        del req_id
        cuda_stream = _tor.cuda.Stream(
            stream_id=stream.stream_id,
            device_index=stream.device_index,
            device_type=1,  # == kCUDA
        )

        def _trim(ids):
            return [x for x in ids if x != tokenizer.eos_token_id]

        allowed = token_enforcer.get_allowed_tokens(_trim(ids[0]))
        mask = _tor.full_like(logits, fill_value=float("-inf"), device="cpu")
        mask[:, :, allowed] = 0
        mask = mask.to(logits.device)
        with _tor.cuda.stream(cuda_stream):
            logits += mask
        return logits

    result = ""

    def response_cb(req_id: int, tensors: _tp.List[_tb.NamedTensor],
                    is_finished: bool, err_msg: str):
        nonlocal remaining_requests, result
        assert not err_msg
        assert is_finished
        tensors_dict = {t.name: t.tensor for t in tensors}
        result = tokenizer.decode(
            tensors_dict["output_ids"].squeeze().tolist())
        remaining_requests -= 1

    def should_stop():
        return set()

    def stats_cb(stats_json: str):
        assert _json.loads(stats_json)

    inference_request_list = []
    ir = _tb.InferenceRequest(42, logits_post_processor)
    ir.input_ids = _tor.tensor(prompt, dtype=_tor.int32)
    ir.max_new_tokens = _tor.tensor([[64]], dtype=_tor.int32)
    ir.end_id = _tor.tensor([tokenizer.eos_token_id], dtype=_tor.int32)
    inference_request_list.append(ir)
    remaining_requests = len(inference_request_list)

    opt_params = _tb.TrtGptModelOptionalParams()
    with _tb.GptManager(model_path, _tb.TrtGptModelType.InflightBatching, 1,
                        _tb.SchedulerPolicy.MAX_UTILIZATION, fetch_requests,
                        response_cb, should_stop, stats_cb, opt_params,
                        10000):
        while remaining_requests > 0:
            _time.sleep(0.1)

    assert result == input + ' { "last_name": "Michael Jordan", "year_of_birth": 18 } '
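
# Note: the literal completion checked by the final assert (including
# "year_of_birth": 18) is whatever gpt2 produces under the schema mask with
# the runtime's default sampling; the token enforcer only guarantees that the
# generated text parses against AnswerFormat's JSON schema, not these exact
# field values.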