import asyncio import json import os import random import shutil import sys import tempfile from typing import List, Optional, Union import datasets import pytest import torch import transformers from tensorrt_llm._utils import release_gc from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.executor import (ExecutorBindingsWorker, LoRARequest, PromptAdapterRequest, RequestError) from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, KvCacheConfig, NoStatsAvailable, SamplingParams) from tensorrt_llm.llmapi.llm_utils import BuildConfig, _ParallelConfig from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.models import PretrainedConfig from tensorrt_llm.models.automodel import AutoConfig, AutoModelForCausalLM sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.llm_data import llm_models_root from utils.util import force_ampere, similar, skip_less_than_40gb_memory # The unittests are based on the tiny-llama, which is fast to build and run. # There are other tests based on llama-7B model, such as the end-to-end tests in test_e2e.py, and parallel tests in # test_llm_multi_gpu.py. def get_model_path(model_name): engine_dir = os.environ.get('LLM_ENGINE_DIR', None) if engine_dir: return engine_dir return str(llm_models_root() / model_name) def llm_test_harness(model_dir: str, inputs: List[str], references: List[str], *, sampling_params: Optional[SamplingParams] = None, similar_threshold: float = 0.8, **llm_kwargs): tp_size = llm_kwargs.get('tensor_parallel_size', 1) pp_size = llm_kwargs.get('pipeline_parallel_size', 1) world_size = tp_size * pp_size if world_size > torch.cuda.device_count(): pytest.skip( f"world_size ({world_size}) is greater than available GPUs ({torch.cuda.device_count()})" ) tokenizer = llm_kwargs.pop('tokenizer', None) if tokenizer is None: tokenizer = model_dir llm = LLM(model_dir, tokenizer=tokenizer, **llm_kwargs) outputs = llm.generate(inputs, sampling_params=sampling_params) print(outputs) for out, ref in zip(outputs, references): if isinstance(ref, list): # N output assert len(out.outputs) == len(ref) for o, r in zip(out.outputs, ref): assert similar(o.text, r, threshold=similar_threshold) else: assert similar(out.outputs[0].text, ref, threshold=similar_threshold) del llm release_gc() def llm_check_output(llm: LLM, inputs: List[str], references: List[str], *, sampling_params: Optional[SamplingParams] = None, similar_threshold: float = 0.8, finish_reasons: Optional[List[str]] = None, stop_reasons: Optional[List[Union[int, str]]] = None, **gen_kwargs): outputs = llm.generate(inputs, sampling_params=sampling_params, **gen_kwargs) assert len(outputs) == len(references) for i, (output, target_output) in enumerate(zip(outputs, references)): if isinstance(target_output, list): # N output assert len(output.outputs) == len(target_output) for j, (out, ref) in enumerate(zip(output.outputs, target_output)): assert similar(out.text, ref, threshold=similar_threshold) if finish_reasons is not None: assert out.finish_reason == finish_reasons[i][j] if stop_reasons is not None: assert out.stop_reason == stop_reasons[i][j] else: out = output.outputs[0] assert similar(out.text, target_output, threshold=similar_threshold) if finish_reasons is not None: assert out.finish_reason == finish_reasons[i] if stop_reasons is not None: assert out.stop_reason == stop_reasons[i] default_model_name = "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" mixtral_model_name = "Mixtral-8x7B-v0.1" llama_model_path = get_model_path(default_model_name) llm_engine_dir = os.environ.get('LLM_ENGINE_DIR', './tmp.engine') cnn_dailymail_path = str(llm_models_root() / "datasets" / "cnn_dailymail") alpaca_chinese_path = str(llm_models_root() / "datasets" / "silk-road" / "alpaca-data-gpt4-chinese") prompts = ["A B C"] global_kvcache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) @force_ampere def test_llm_build_config(): build_config = BuildConfig() # change some building parameters build_config.max_batch_size = 129 build_config.max_beam_width = 4 build_config.max_num_tokens = 888 build_config.strongly_typed = True build_config.max_seq_len = 333 llm = LLM(model=llama_model_path, build_config=build_config, kv_cache_config=global_kvcache_config) tmpdir = tempfile.TemporaryDirectory() llm.save(tmpdir.name) with open(os.path.join(tmpdir.name, "config.json"), "r") as f: # read the build_config and check if the parameters are correctly saved engine_config = json.load(f) build_config1 = BuildConfig.from_dict(engine_config["build_config"]) # Know issue: this will be converted to None after save engine for single-gpu build_config1.plugin_config.nccl_plugin = 'float16' assert build_config1.max_batch_size == build_config.max_batch_size assert build_config1.max_beam_width == build_config.max_beam_width assert build_config1.max_num_tokens == build_config.max_num_tokens assert build_config1.strongly_typed == build_config.strongly_typed assert build_config1.max_seq_len == build_config.max_seq_len def test_llm_loading_from_hf(): sampling_params = SamplingParams(max_tokens=8) llm_test_harness(llama_model_path, prompts, ["D E F G H I J K"], sampling_params=sampling_params, kv_cache_config=global_kvcache_config) @force_ampere def test_llm_loading_from_ckpt(): tokenizer = TransformersTokenizer.from_pretrained(llama_model_path) assert tokenizer is not None ckpt_dir = tempfile.TemporaryDirectory() llama = AutoModelForCausalLM.from_hugging_face(llama_model_path) llama.save_checkpoint(ckpt_dir.name) del llama llm_test_harness(ckpt_dir.name, prompts, ["D E F G H I J K"], tokenizer=tokenizer, kv_cache_config=global_kvcache_config, sampling_params=SamplingParams(max_tokens=8)) @pytest.mark.parametrize('model_format', ['hf', 'ckpt']) def test_llm_with_dummy_weights(model_format): # dummy_dir contains config.json and tokenizer files only # the test fails if load_format != 'dummy' dummy_dir = tempfile.TemporaryDirectory() if model_format == 'hf': hf_config = transformers.AutoConfig.from_pretrained(llama_model_path) hf_config.save_pretrained(dummy_dir.name) else: config = AutoConfig.from_hugging_face(llama_model_path, dtype='float16') config.to_json_file(os.path.join(dummy_dir.name, 'config.json')) tokenizer = transformers.AutoTokenizer.from_pretrained(llama_model_path) tokenizer.save_pretrained(dummy_dir.name) sampling_params = SamplingParams(max_tokens=8) llm_test_harness(dummy_dir.name, prompts, ["A placeholder reference for dummy-weight engine."], sampling_params=sampling_params, similar_threshold=0.0, load_format='dummy', kv_cache_config=global_kvcache_config) class MyTokenizer(TokenizerBase): ''' A wrapper for the Transformers' tokenizer. This is the default tokenizer for LLM. ''' @classmethod def from_pretrained(cls, pretrained_model_dir: str, **kwargs): tokenizer = transformers.AutoTokenizer.from_pretrained( pretrained_model_dir, **kwargs) return MyTokenizer(tokenizer) def __init__(self, tokenizer): self.tokenizer = tokenizer @property def eos_token_id(self) -> int: return self.tokenizer.eos_token_id @property def pad_token_id(self) -> int: return self.tokenizer.pad_token_id def encode(self, text: str, **kwargs) -> List[int]: return self.tokenizer.encode(text, **kwargs) def decode(self, token_ids: List[int], **kwargs) -> str: return self.tokenizer.decode(token_ids, **kwargs) def batch_encode_plus(self, texts: List[str], **kwargs) -> dict: return self.tokenizer.batch_encode_plus(texts, **kwargs) def test_llm_with_customized_tokenizer(): llm = LLM( model=llama_model_path, # a customized tokenizer is passed to override the default one tokenizer=MyTokenizer.from_pretrained(llama_model_path), kv_cache_config=global_kvcache_config) for output in llm.generate(prompts): print(output) def test_llm_without_tokenizer(): llm = LLM(model=llama_model_path, skip_tokenizer_init=True, kv_cache_config=global_kvcache_config) sampling_params = SamplingParams(end_id=2, pad_id=2, max_tokens=8) prompts = [[23, 14, 3]] for output in llm.generate(prompts, sampling_params=sampling_params): assert not output.outputs[0].text, \ "The output should be empty since the tokenizer is missing" print(output) @pytest.mark.parametrize( 'tokenizer_dir, threshold', [ (get_model_path('gpt2'), 0.95), # BPE (get_model_path('bert/bert-base-uncased'), 0.95), # WordPiece (get_model_path('t5-small'), 0.95), # SentencePiece (get_model_path('opt-125m'), 0.95), (get_model_path('starcoder2-3b'), 0.95), (get_model_path('gpt-j-6b'), 0.95), (get_model_path('bloom-560m'), 0.95), (get_model_path('mpt-7b'), 0.95), (get_model_path('falcon-7b-instruct'), 0.95), (get_model_path('llama-models-v2/llama-v2-7b-hf'), 0.95), (get_model_path('codellama/CodeLlama-7b-Instruct-hf'), 0.95), (llama_model_path, 0.95), (get_model_path(mixtral_model_name), 0.95) ]) def test_tokenizer_decode_incrementally(tokenizer_dir: str, threshold: float): random.seed(42) num_samples = 100 cnn_dailymail = datasets.load_dataset(cnn_dailymail_path, name='3.0.0', split='train') alpaca_chinese = datasets.load_dataset(alpaca_chinese_path, split='train') dataset = cnn_dailymail['article'][:num_samples // 2] + alpaca_chinese[ 'output_zh'][:num_samples // 2] tokenizer = TransformersTokenizer.from_pretrained(tokenizer_dir, legacy=False, padding_side='left', truncation_side='left', trust_remote_code=True, use_fast=True) num_perfect = 0 for text in dataset: token_ids = tokenizer.encode(text, add_special_tokens=False) seq_len = len(token_ids) prompt_len = random.randint(1, seq_len // 2) decoded_text, states = tokenizer.decode_incrementally( token_ids[:prompt_len]) for i in range(prompt_len, len(token_ids)): decoded_text, states = tokenizer.decode_incrementally( [token_ids[i]], decoded_text, states) if tokenizer_dir.endswith( 'bert-base-uncased') and tokenizer.clean_up_tokenization_spaces: decoded_text = tokenizer.clean_up_tokenization(decoded_text) reference = tokenizer.decode(token_ids) if decoded_text == reference: num_perfect += 1 else: # For non-perfect matching cases, decoded_text should also be very similar to the reference assert similar(decoded_text, reference, 0.99) print(f"Perfect matching ratio: {num_perfect / num_samples * 100}%") assert num_perfect / num_samples >= threshold # TODO[chunweiy]: Move mixtral test to the e2e test def is_memory_enough_for_mixtral(): if torch.cuda.device_count() < 2: return False try: total_memory = get_total_gpu_memory(0) + get_total_gpu_memory(1) if total_memory >= 160 * 1024**3: return True except: return False def test_llm_generate_async(): _test_llm_generate_async() def _test_llm_generate_async(model_name=default_model_name, tp_size: int = 1, use_auto_parallel: bool = False, tokenizer=None): if "Mixtral" in model_name and use_auto_parallel: pytest.skip("Auto parallel is not supported for Mixtral models") tp_size = tp_size if not use_auto_parallel else 1 world_size = tp_size if use_auto_parallel else None llm = LLM( model=get_model_path(model_name), tokenizer=tokenizer, kv_cache_config=global_kvcache_config, tensor_parallel_size=tp_size, auto_parallel=use_auto_parallel, auto_parallel_world_size=world_size, ) sampling_params = SamplingParams(max_tokens=6) def test_async(streaming: bool): async def task(prompt: str): outputs = [] async for output in llm.generate_async( prompt, streaming=streaming, sampling_params=sampling_params): print('output', output) outputs.append(output.outputs[0].text) print(' '.join(outputs)) async def main(): tasks = [task(prompt) for prompt in prompts] await asyncio.gather(*tasks) asyncio.run(main()) def test_wait(streaming: bool): for prompt in prompts: future = llm.generate_async(prompt, streaming=streaming, sampling_params=sampling_params) for output in future: print('wait', output) def test_non_streaming_usage_wait(): for prompt in prompts: output = llm.generate_async(prompt, streaming=False, sampling_params=sampling_params) print(output.outputs[0].text) def test_future(streaming: bool): for prompt in prompts: future = llm.generate_async(prompt, streaming=streaming, sampling_params=sampling_params) if streaming is True: for output in future: # Do something else and then wait for the result if needed output = output.result(timeout=10) print('future', output.outputs[0].text) else: # Do something else and then wait for the result if needed output = future.result(timeout=10) print('future', output.outputs[0].text) def test_future_async(): async def task(prompt: str): future = llm.generate_async(prompt, streaming=False, sampling_params=sampling_params) output = await future.aresult() print('future', output.outputs[0].text) async def main(): tasks = [task(prompt) for prompt in prompts] await asyncio.gather(*tasks) asyncio.run(main()) test_async(streaming=True) test_async(streaming=False) test_wait(streaming=True) test_wait(streaming=False) test_future(streaming=True) test_future(streaming=False) test_future_async() test_non_streaming_usage_wait() del llm release_gc() @pytest.fixture(scope="module") def llm_for_sampling_params() -> LLM: llm = LLM( model=llama_model_path, kv_cache_config=global_kvcache_config, ) return llm def test_user_specify_workspace(): user_specified_ws_path = '/tmp/specified_workspace' shutil.rmtree(user_specified_ws_path, ignore_errors=True) os.mkdir(user_specified_ws_path) llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, workspace=user_specified_ws_path) pre_built_engine_cfg = llm.args.model / 'config.json' assert pre_built_engine_cfg.exists() del llm release_gc() assert not pre_built_engine_cfg.exists() @force_ampere def test_generate_with_sampling_params_per_prompt(llm_for_sampling_params: LLM): llm = llm_for_sampling_params sampling_params_list = [ SamplingParams(end_id=-1, pad_id=-1) for _ in range(2) ] sampling_params_list[0].max_tokens = 4 sampling_params_list[1].max_tokens = 8 for i, output in enumerate( llm.generate(prompts, sampling_params=sampling_params_list)): output_len = len(output.outputs[0].token_ids) print(f"output_len: {output_len}") assert output_len <= sampling_params_list[i].max_tokens @force_ampere @pytest.fixture(scope="module") @pytest.mark.parametrize( "sampling_params", [ # temperature SamplingParams( max_tokens=6, temperature=0.5, beam_search_diversity_rate=0.5), # topK SamplingParams(max_tokens=6, top_k=10, top_p=0.92), # topP SamplingParams(max_tokens=6, top_p=0.92), # penalty SamplingParams(max_tokens=8, length_penalty=1.0, presence_penalty=0.0, repetition_penalty=1.0, min_tokens=5), # early stopping SamplingParams(max_tokens=6, early_stopping=5), # n-returns SamplingParams(max_tokens=6, n=2, top_k=2), SamplingParams(max_tokens=6, n=2, top_k=2, best_of=3), SamplingParams(max_tokens=6, n=3, use_beam_search=True), SamplingParams(max_tokens=6, n=2, best_of=3, use_beam_search=True), ]) def test_generate_with_SamplingConfig(llm_for_sampling_params: LLM, sampling_params: SamplingParams): llm = llm_for_sampling_params for output in llm.generate(prompts, sampling_params=sampling_params): print(output) assert len(output.outputs) == sampling_params.n @force_ampere def test_generate_with_beam_search(): build_config = BuildConfig() build_config.max_beam_width = 2 build_config.max_num_tokens = 20 llm_test_harness( llama_model_path, prompts, [["D E F G H I", "D E F G I J"]], build_config=build_config, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), sampling_params=SamplingParams(max_tokens=6, beam_width=2)) @force_ampere def test_generate_with_streaming_llm(): # TODO[chunweiy]: Test with larger size when the underlying support is ready build_config = BuildConfig() build_config.plugin_config.streamingllm = True build_config.max_batch_size = 8 build_config.max_seq_len = 512 kv_cache_config = KvCacheConfig(max_attention_window=[64], sink_token_length=4) # Check the plugin config is correctly set assert build_config.plugin_config.streamingllm is True sampling_params = SamplingParams(max_tokens=4) llm_test_harness(llama_model_path, prompts, ["D E F G"], sampling_params=sampling_params, build_config=build_config, kv_cache_config=kv_cache_config) def test_parallel_config(): config = _ParallelConfig() config.tp_size = 2 config.pp_size = 2 assert config.world_size == 4 config.world_size = 4 # should not raise exception with pytest.raises(ValueError): config.world_size = 5 @force_ampere # Save H100 resource @pytest.mark.parametrize("gather_context_logits", [True, False]) @pytest.mark.parametrize("gather_generation_logits", [True, False]) @pytest.mark.parametrize("return_log_probs", [True]) # prune space def test_generate_with_OutputConfig(gather_context_logits: bool, gather_generation_logits: bool, return_log_probs: bool): if not (gather_context_logits or gather_generation_logits): # prune space return build_config = BuildConfig() build_config.gather_context_logits = gather_context_logits build_config.gather_generation_logits = gather_generation_logits llm = LLM( model=llama_model_path, kv_cache_config=global_kvcache_config, build_config=build_config, ) sampling_params = SamplingParams( max_tokens=8, return_context_logits=gather_context_logits, return_generation_logits=gather_generation_logits, return_log_probs=return_log_probs) for output in llm.generate(prompts, sampling_params=sampling_params): if gather_context_logits: assert output.context_logits is not None assert len(prompts[0].split()) + \ 1 == output.context_logits.shape[0] if gather_generation_logits: assert output.outputs[0].generation_logits is not None assert sampling_params.max_tokens == output.outputs[ 0].generation_logits.shape[0] if return_log_probs: assert output.outputs[0].logprobs is not None print(output) @force_ampere def test_generate_with_stop_words(): llm = LLM( model=llama_model_path, kv_cache_config=global_kvcache_config, ) stop_id = llm.tokenizer.encode("N", add_special_tokens=False)[-1] llm_check_output(llm, prompts, ["D E F G H I J K L M"], sampling_params=SamplingParams(end_id=stop_id), finish_reasons=['stop'], stop_reasons=[None]) llm_check_output(llm, prompts, ["D E F G H"], sampling_params=SamplingParams(max_tokens=5), finish_reasons=['length'], stop_reasons=[None]) llm_check_output(llm, prompts, ["D E F G H I J K L M"], sampling_params=SamplingParams(stop_token_ids=[stop_id]), finish_reasons=['stop'], stop_reasons=[stop_id]) llm_check_output(llm, prompts, ["D E F G H I J K L M N"], sampling_params=SamplingParams( stop_token_ids=[stop_id], include_stop_str_in_output=True), finish_reasons=['stop'], stop_reasons=[stop_id]) llm_check_output(llm, prompts, ["D E F G H"], sampling_params=SamplingParams(stop="I J"), finish_reasons=['stop'], stop_reasons=["I J"]) llm_check_output(llm, prompts, ["D E F G H I J K L M"], sampling_params=SamplingParams(stop="I E", max_tokens=10), finish_reasons=['length'], stop_reasons=[None]) llm_check_output(llm, prompts, ["D E F G H I J"], sampling_params=SamplingParams( stop="I J", include_stop_str_in_output=True), finish_reasons=['stop'], stop_reasons=["I J"]) llm_check_output(llm, prompts, ["D E F G H"], sampling_params=SamplingParams(stop=["F E", "I J"], stop_token_ids=[stop_id]), finish_reasons=['stop'], stop_reasons=["I J"]) @force_ampere def test_generate_with_bad_words(): llm = LLM( model=llama_model_path, kv_cache_config=global_kvcache_config, ) bad_id = llm.tokenizer.encode("N", add_special_tokens=False)[-1] llm_check_output(llm, prompts, ["D E F G H I J K L M\n\nI hope this"], sampling_params=SamplingParams(max_tokens=15, bad_token_ids=[bad_id])) llm_check_output(llm, prompts, ["D E F G H I K L M N O P Q R S"], sampling_params=SamplingParams(max_tokens=15, bad="I J")) llm_check_output(llm, prompts, ["D E F G H I K L M N O P Q R S"], sampling_params=SamplingParams(max_tokens=15, bad=["F E", "I J"])) @force_ampere def test_generate_with_sampling_params_misc(): llm = LLM( model=llama_model_path, tokenizer_mode='slow', kv_cache_config=global_kvcache_config, ) fake_end_id = llm.tokenizer.encode("N", add_special_tokens=False)[-1] llm_check_output(llm, prompts, ["D E F G H I J K L M"], sampling_params=SamplingParams(max_tokens=15, end_id=fake_end_id)) llm_check_output(llm, prompts, ["D E F G H I K L M N O P Q R S"], sampling_params=SamplingParams(max_tokens=15, end_id=fake_end_id, ignore_eos=True)) llm_check_output(llm, prompts, [""], sampling_params=SamplingParams(max_tokens=15, end_id=fake_end_id, detokenize=False)) outputs = llm.generate(prompts) assert outputs[0].prompt_token_ids == [1, 319, 350, 315] outputs = llm.generate(prompts, SamplingParams(add_special_tokens=False)) assert outputs[0].prompt_token_ids == [319, 350, 315] outputs = llm.generate(prompts, SamplingParams(truncate_prompt_tokens=2)) assert outputs[0].prompt_token_ids == [1, 315] # Use embedding bias to force the output tokens to be special tokens unk_id = llm.tokenizer.encode('', add_special_tokens=False)[-1] vocab_size_padded = 32000 embedding_bias = torch.zeros(vocab_size_padded) embedding_bias[unk_id] = torch.finfo(torch.float32).max outputs = llm.generate( prompts, SamplingParams(max_tokens=5, embedding_bias=embedding_bias)) assert outputs[0].outputs[0].text == "" outputs = llm.generate( prompts, SamplingParams(max_tokens=5, embedding_bias=embedding_bias, skip_special_tokens=False, spaces_between_special_tokens=False)) assert outputs[0].outputs[0].text == "" outputs = llm.generate( prompts, SamplingParams(max_tokens=5, embedding_bias=embedding_bias, skip_special_tokens=False, spaces_between_special_tokens=True)) assert outputs[0].outputs[0].text == " " @force_ampere def test_generate_with_embedding_bias(): tokenizer = transformers.AutoTokenizer.from_pretrained(llama_model_path) biased_word_id = tokenizer.encode("Z", add_special_tokens=False)[-1] vocab_size_padded = 32000 embedding_bias = torch.zeros(vocab_size_padded) embedding_bias[biased_word_id] = torch.finfo(torch.float32).max sampling_params = SamplingParams(max_tokens=6, embedding_bias=embedding_bias) llm_test_harness( llama_model_path, prompts, ["Z Z Z Z Z Z"], sampling_params=sampling_params, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4)) @force_ampere def test_generate_with_logits_post_processor(): tokenizer = TransformersTokenizer.from_pretrained(llama_model_path) biased_word_id = tokenizer.encode("Z", add_special_tokens=False)[-1] def logits_post_processor(req_id: int, logits: torch.Tensor, ids: List[List[int]], stream_ptr: int, client_id: Optional[int]): with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)): logits[:] = float("-inf") logits[..., biased_word_id] = 0 sampling_params = SamplingParams(max_tokens=6, logits_post_processor_name="my_logits_pp") llm_test_harness( llama_model_path, prompts, ["Z Z Z Z Z Z"], sampling_params=sampling_params, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), logits_post_processor_map={"my_logits_pp": logits_post_processor}) def llama_v2_13b_lora_test_harness(**llm_kwargs): hf_model_dir = get_model_path("llama-models-v2/llama-v2-13b-hf") hf_lora_dir = get_model_path("llama-models-v2/chinese-llama-2-lora-13b") # For LoRA checkpoints with finetuned embedding and lm_head, lora_dir must be provided at build time. build_config = BuildConfig(lora_config=LoraConfig(lora_dir=[hf_lora_dir])) llm = LLM(hf_model_dir, tokenizer=hf_lora_dir, enable_lora=True, max_lora_rank=64, build_config=build_config, **llm_kwargs) prompts = [ "今天天气很好,我到公园的时候,", "今天天气很好,我到公园的时候,", ] references = [ "看见好多人们都看书,看书书看书书,看书书看书书书书书书", "发现公园里到处都是人,有的在跑步,有的在打羽毛球,还有的", ] lora_req = LoRARequest("Chinese", 1, hf_lora_dir) sampling_params = SamplingParams(max_tokens=20, add_special_tokens=False) outputs = llm.generate(prompts, sampling_params, lora_request=[None, lora_req]) for output, ref in zip(outputs, references): assert similar(output.outputs[0].text, ref) def llama_7b_multi_lora_test_harness(**llm_kwargs): hf_model_dir = get_model_path("llama-models/llama-7b-hf") hf_lora_dir1 = get_model_path("llama-models/luotuo-lora-7b-0.1") hf_lora_dir2 = get_model_path("llama-models/Japanese-Alpaca-LoRA-7b-v0") # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. build_config = BuildConfig(lora_config=LoraConfig( lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) llm = LLM(hf_model_dir, enable_lora=True, max_lora_rank=8, build_config=build_config, **llm_kwargs) prompts = [ "美国的首都在哪里? \n答案:", "美国的首都在哪里? \n答案:", "美国的首都在哪里? \n答案:", "アメリカ合衆国の首都はどこですか? \n答え:", "アメリカ合衆国の首都はどこですか? \n答え:", "アメリカ合衆国の首都はどこですか? \n答え:", ] references = [ "沃尔玛\n\n## 新闻\n\n* ", "美国的首都是华盛顿。\n\n美国的", "纽约\n\n### カンファレンスの", "Washington, D.C.\nWashington, D.C. is the capital of the United", "华盛顿。\n\n英国の首都是什", "ワシントン\nQ1. アメリカ合衆国", ] lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) sampling_params = SamplingParams(max_tokens=20) outputs = llm.generate( prompts, sampling_params, lora_request=[None, lora_req1, lora_req2, None, lora_req1, lora_req2]) for output, ref in zip(outputs, references): assert similar(output.outputs[0].text, ref) @skip_less_than_40gb_memory def test_llama_v2_13b_lora(): llama_v2_13b_lora_test_harness() @skip_less_than_40gb_memory def test_llama_7b_multi_lora(): llama_7b_multi_lora_test_harness(max_loras=1, max_cpu_loras=8) @skip_less_than_40gb_memory def test_mixtral_8x7b_moe_tp_and_moe_ep(**llm_kwargs): mixtral_8x7b_dir = get_model_path("Mixtral-8x7B-v0.1") llm = LLM(mixtral_8x7b_dir, tensor_parallel_size=2, moe_expert_parallel_size=2, moe_tensor_parallel_size=1, **llm_kwargs) tmpdir = tempfile.TemporaryDirectory() llm.save(tmpdir.name) with open(os.path.join(tmpdir.name, "config.json"), "r") as f: # read the build_config and check if the parameters are correctly saved engine_config = json.load(f) pretrained_config = PretrainedConfig.from_dict( engine_config["pretrained_config"]) assert pretrained_config.mapping.moe_tp_size == 1 assert pretrained_config.mapping.moe_ep_size == 2 def llama_v2_7b_prompt_adapter_test_harness(**llm_kwargs): hf_model_dir = get_model_path("llama-models-v2/llama-v2-7b-hf") hf_prompt_adapter_dir = get_model_path("llama-models-v2/llama_tweet_ptune") llm = LLM(hf_model_dir, enable_prompt_adapter=True, max_prompt_adapter_token=8, **llm_kwargs) prompts = [ "Born in north-east France, Soyer trained as a", "Born in north-east France, Soyer trained as a", "Tweet text: I have complaints! Label: ", "Tweet text: I have complaints! Label: ", "Tweet text: I have no problems Label: ", "Tweet text: I have no problems Label: ", ] references = [ "painter at the École des Beaux-Arts in Paris. He was a member of the", "chef and has worked in the restaurant industry for 15 years.Ћ\nBorn in north", "1999.\nTweet text: I have complaints! Label: 19", "no complaint", "100%\nI have no problems Label: 100%\nI have no", "no complaint", ] pa_req = PromptAdapterRequest('tweet', 1, hf_prompt_adapter_dir) sampling_params = SamplingParams(max_tokens=20) outputs = llm.generate( prompts, sampling_params, prompt_adapter_request=[None, pa_req, None, pa_req, None, pa_req]) for output, ref in zip(outputs, references): assert similar(output.outputs[0].text, ref) @skip_less_than_40gb_memory def test_llama_v2_7b_prompt_adapter(): llama_v2_7b_prompt_adapter_test_harness() @force_ampere def test_generate_block_reuse(): build_config = BuildConfig() build_config.plugin_config._use_paged_context_fmha = True build_config.plugin_config._paged_kv_cache = True llm = LLM(model=llama_model_path, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4, enable_block_reuse=True), build_config=build_config) sampling_params = SamplingParams(max_tokens=6) prompts = ["A B C", "A B C D"] for output in llm.generate(prompts, sampling_params=sampling_params): print(output) del llm release_gc() def test_executor_results_cleanup(): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) sampling_params = SamplingParams(max_tokens=6) for i in range(20): llm.generate(prompts, sampling_params=sampling_params) num_remaining_results = len(llm._executor._results) print(f"result.size: {num_remaining_results}") assert num_remaining_results == 0 del llm release_gc() @pytest.mark.parametrize("trust_remote_code", [True, False]) def _test_llm_trust_remote_code(trust_remote_code: bool): # OOM when tested with other cases # TODO[chunweiy]: Enable this later release_gc() if trust_remote_code: internlm_model_path = get_model_path("internlm-chat-7b") llm = LLM(model=internlm_model_path, trust_remote_code=trust_remote_code, tokenizer=TransformersTokenizer.from_pretrained( internlm_model_path, trust_remote_code=trust_remote_code), kv_cache_config=global_kvcache_config) sampling_params = SamplingParams(max_tokens=6, temperature=0.8, top_p=0.95) prompts = [ "The future of AI is", ] for output in llm.generate(prompts, sampling_params=sampling_params): print(output) del llm release_gc() else: with pytest.raises(ValueError): llm = LLM(model="internlm/internlm-chat-7b", trust_remote_code=trust_remote_code, tokenizer="internlm/internlm-chat-7b", kv_cache_config=global_kvcache_config) def test_llm_build_cache(): # Activate the build-cache cache_config = BuildCacheConfig(max_records=1, max_cache_storage_gb=10) sampling_params = SamplingParams(max_tokens=6) def first_run(): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, enable_build_cache=cache_config) llm_check_output(llm, prompts, ["D E F G H I J K"], sampling_params=sampling_params) del llm release_gc() def second_run(): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, enable_build_cache=cache_config) llm_check_output(llm, prompts, ["D E F G H I J K"], sampling_params=sampling_params) # the cache should be hit assert llm.llm_build_stats.cache_hitted, llm.llm_build_stats.cache_info del llm release_gc() first_run() second_run() class DummyError(Exception): pass class DummyExecutorMeta(type): def __new__(cls, name, bases, dic, worker_cls): new_cls = super().__new__(cls, name, bases, dic) @classmethod def create(cls, engine, executor_config, *args, **kwargs): return worker_cls(engine=engine, executor_config=executor_config) new_cls.create = create return new_cls class DummyExecutorWorker2(ExecutorBindingsWorker): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.counter = 0 def await_response_task(self) -> bool: self.counter += 1 if self.counter == 2: # raise exception on the third token print(f"To raise exception") raise DummyError("Test exception") return super().await_response_task() DummyExecutor2 = DummyExecutorMeta("DummyExecutor2", (), {}, worker_cls=DummyExecutorWorker2) def test_executor_process_background_error(): llm = LLM(model=llama_model_path, executor_cls=DummyExecutor2, kv_cache_config=global_kvcache_config) # The dummy executor will delay the responses sampling_params = SamplingParams(max_tokens=6) # test in streaming mode async def task(): with pytest.raises(DummyError): async for output in llm.generate_async( prompts[0], streaming=True, sampling_params=sampling_params): print(output) asyncio.run(task()) def test_llm_apidocs(): doc = LLM.__doc__ assert doc assert doc.find('pipeline_parallel_size') != -1 assert doc.find('tensor_parallel_size') != -1 assert doc.find('auto_parallel') != -1 def check_llm_return_context_logits(tp_size=1): build_config = BuildConfig(gather_context_logits=True) llm = LLM(llama_model_path, tensor_parallel_size=tp_size, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), build_config=build_config) sampling_params = SamplingParams(max_tokens=8, return_context_logits=True) prompts = ["A B C D E F G H I J K"] * 8 for output in llm.generate(prompts, sampling_params=sampling_params): assert isinstance(output.context_logits, torch.Tensor) print(output) del llm release_gc() def check_llm_return_generation_logits(tp_size=1): build_config = BuildConfig(gather_generation_logits=True) llm = LLM(llama_model_path, tensor_parallel_size=tp_size, kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4), build_config=build_config) sampling_params = SamplingParams(max_tokens=8, return_generation_logits=True) prompts = ["A B C D E F G H I J K"] * 8 for output in llm.generate(prompts, sampling_params=sampling_params): assert isinstance(output.outputs[0].generation_logits, torch.Tensor) print(output) del llm release_gc() def test_llm_return_context_logits(): check_llm_return_context_logits(tp_size=1) def test_llm_return_generation_logits(): check_llm_return_generation_logits(tp_size=1) class DummyExecutorWorker3(ExecutorBindingsWorker): should_raise_error = True def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.counter = 0 self.failed_requests = set() def _engine_response_callback(self, response: tllm.Response): if response.client_id in self.failed_requests: return None # Making the first response failed, and the subsequent responses successful if DummyExecutorWorker3.should_raise_error: DummyExecutorWorker3.should_raise_error = False print(f"Raise error for {response.client_id}") self.failed_requests.add(response.client_id) return tllm.Response( request_id=0, # dummy value client_id=response.client_id, error_msg="Test error") else: return response DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {}, worker_cls=DummyExecutorWorker3) def test_llm_handling_per_requeust_error(): llm = LLM(model=llama_model_path, executor_cls=DummyExecutor3, kv_cache_config=global_kvcache_config) # The dummy executor will delay the responses sampling_params = SamplingParams(max_tokens=6) def batch_task(): DummyExecutorWorker3.should_raise_error = True with pytest.raises(RequestError): for output in llm.generate(prompts, sampling_params=sampling_params): print(output) for output in llm.generate(prompts, sampling_params=sampling_params): print(output) batch_task() def test_llm_handling_per_requeust_error_async(): llm = LLM(model=llama_model_path, executor_cls=DummyExecutor3, kv_cache_config=global_kvcache_config) # The dummy executor will delay the responses sampling_params = SamplingParams(max_tokens=6) # test in streaming mode async def task(): # 10 requests, each request will get error, while the whole LLM instance is still alive with pytest.raises(RequestError): DummyExecutorWorker3.should_raise_error = True async for output in llm.generate_async( prompts[0], streaming=True, sampling_params=sampling_params): print(output) DummyExecutorWorker3.should_raise_error = False async for output in llm.generate_async(prompts[0], streaming=True, sampling_params=sampling_params): print(output) asyncio.run(task()) def test_llm_get_stats(tp_size: int = 1): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, tensor_parallel_size=tp_size, fast_build=True) sampling_params = SamplingParams(max_tokens=100) for output in llm.generate(prompts, sampling_params=sampling_params): print(output) while True: try: stats = llm._get_stats(2) print(stats) except NoStatsAvailable: break def test_llm_get_stats_async(tp_size: int = 1): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, tensor_parallel_size=tp_size, fast_build=True) sampling_params = SamplingParams(max_tokens=6) async def task0(): async for output in llm.generate_async(prompts[0], streaming=True, sampling_params=sampling_params): print(output) async def task1(): while True: try: stats = await llm._get_stats_async(2) print(stats) except NoStatsAvailable: break async def main(): await asyncio.gather(task0(), task1()) asyncio.run(main()) def _test_llm_capture_request_error(tp_size: int = 1): build_config = BuildConfig() build_config.max_num_tokens = 64 llm = LLM( model=llama_model_path, build_config=build_config, ) prompt = 'A ' * 65 # the minimum max_num_tokens is 64 with pytest.raises(RequestError): llm.generate(prompt) def test_llm_capture_request_error(): _test_llm_capture_request_error(tp_size=1) def test_llm_api_jupyter_scenario(): llm = LLM( model=llama_model_path, kv_cache_config=global_kvcache_config, ) sampling_params = SamplingParams(temperature=0.8, top_p=0.95) async def task(): return llm.generate(["A", "B", "C", "D"], sampling_params) output = asyncio.run(task()) for token in output: print(token) if __name__ == '__main__': test_llm_capture_request_error()