import json import random import time from contextlib import contextmanager, nullcontext from typing import Optional import pytest from tensorrt_llm import LLM from tensorrt_llm.disaggregated_params import DisaggregatedParams from tensorrt_llm.executor import GenerationExecutorWorker from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer from tensorrt_llm.metrics import MetricNames from tensorrt_llm.sampling_params import SamplingParams # isort: off from .lora_test_utils import ( check_llama_7b_multi_lora_from_request_test_harness, check_llama_7b_multi_unique_lora_adapters_from_request, create_mock_nemo_lora_checkpoint) from .test_llm import (_test_llm_capture_request_error, get_model_path, global_kvcache_config, llama_model_path, llm_get_stats_async_test_harness, llm_get_stats_test_harness, llm_return_logprobs_test_harness, llm_test_harness, prompts, run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, tinyllama_logits_processor_test_harness) from utils.util import (force_ampere, similar, skip_fp8_pre_ada, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb, skip_ray) from utils.llm_data import llm_models_root from tensorrt_llm.lora_helper import LoraConfig from tensorrt_llm.executor.request import LoRARequest import tempfile import torch from peft import LoraConfig as PeftLoraConfig from peft import get_peft_model from transformers import AutoModelForCausalLM, AutoTokenizer # isort: on @force_ampere @pytest.mark.parametrize("enable_chunked_prefill,", [False, True]) @pytest.mark.part2 def test_tinyllama_logits_processor(enable_chunked_prefill): tinyllama_logits_processor_test_harness( backend="pytorch", enable_chunked_prefill=enable_chunked_prefill) @skip_ray @pytest.mark.parametrize( "return_context_logits, use_overlap, enable_chunked_prefill, enable_iter_req_stats", [ (False, False, False, True), (False, False, True, True), (False, True, False, True), (False, True, True, True), ]) @pytest.mark.part0 def test_llm_get_stats(return_context_logits, use_overlap, enable_chunked_prefill, enable_iter_req_stats): llm_get_stats_test_harness(tp_size=1, pp_size=1, return_context_logits=return_context_logits, pytorch_backend=True, use_overlap=use_overlap, enable_chunked_prefill=enable_chunked_prefill, enable_iter_req_stats=enable_iter_req_stats) @skip_ray @pytest.mark.parametrize( "return_context_logits, use_overlap, enable_chunked_prefill, enable_iter_req_stats", [ (False, False, False, True), (False, False, True, True), (False, True, False, True), (False, True, True, True), ]) @pytest.mark.part1 def test_llm_get_stats_async(return_context_logits, use_overlap, enable_chunked_prefill, enable_iter_req_stats): llm_get_stats_async_test_harness( tp_size=1, pp_size=1, return_context_logits=return_context_logits, pytorch_backend=True, use_overlap=use_overlap, enable_chunked_prefill=enable_chunked_prefill, enable_iter_req_stats=enable_iter_req_stats) @pytest.mark.part1 def test_llm_capture_request_error(): _test_llm_capture_request_error(pytorch_backend=True, tp_size=1) @force_ampere @pytest.mark.mpi_ray_parity @pytest.mark.parametrize( "sampling_params", [ SamplingParams() # pytorch only supports n=1 ]) @pytest.mark.part0 def test_llm_abort_request(sampling_params): llm = LLM(model=llama_model_path, 
kv_cache_config=global_kvcache_config) run_llm_abort_request(llm=llm, sampling_params=sampling_params) @contextmanager def _validate_invalid_token_error_scope(): with pytest.raises(RuntimeError) as exc_info: yield assert "Token ID out of range" in str(exc_info.value) @force_ampere @pytest.mark.part1 def test_llm_invalid_input_token(): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) prompts = [ [-1], ] # NB: exc_info in _validate_invalid_token_error_scope creates a reference # to a traceback which outlives the scope of 'exc_info' and prevents # deletion of 'llm'. However, using the context manager protocol is # anyways more robust than delegating cleanup to __del__. with llm: with _validate_invalid_token_error_scope(): llm.generate( prompts, sampling_params=SamplingParams(max_tokens=5), ) @force_ampere @pytest.mark.part0 def test_llm_invalid_input_token_async(): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) # NB: exc_info in _validate_invalid_token_error_scope creates a reference # to a traceback which outlives the scope of 'exc_info' and prevents # deletion of 'llm'. However, using the context manager protocol is # anyways more robust than delegating cleanup to __del__. with llm: prompts = [ [-1], [42], ] fail_idx = [0] for submit_order in [[0, 1], [1, 0]]: for collect_order in [[0, 1], [1, 0]]: print(f"submitting {submit_order}") futures = [ llm.generate_async( prompts[submit_idx], sampling_params=SamplingParams(max_tokens=5), ) for submit_idx in submit_order ] for collect_idx in collect_order: with _validate_invalid_token_error_scope( ) if submit_order[collect_idx] in fail_idx else nullcontext( ): print( f"collect order {collect_order}, collecting {collect_idx}" ) futures[collect_idx].result() @pytest.mark.part2 def test_llm_reward_model(): rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B") tokenizer = TransformersTokenizer.from_pretrained(rm_model_path) tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"] llm = LLM(model=rm_model_path, attn_backend="VANILLA", disable_overlap_scheduler=True) sampling_params = SamplingParams(return_context_logits=True) outputs = llm.generate(prompts, sampling_params) scores = outputs[0].context_logits print(scores) assert scores.shape == (tokenized_input.shape[1], 2) assert not outputs[0].outputs[0].text @skip_ray @pytest.mark.part3 def test_llm_perf_metrics(): with LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) as llm: sampling_params = SamplingParams(max_tokens=10, return_perf_metrics=True) outputs = llm.generate(prompts, sampling_params) assert outputs[0].outputs[0].request_perf_metrics is not None perf_metrics = outputs[0].outputs[0].request_perf_metrics timing_metrics = perf_metrics.timing_metrics assert timing_metrics.arrival_time < timing_metrics.first_scheduled_time assert timing_metrics.first_scheduled_time < timing_metrics.first_token_time assert timing_metrics.first_token_time < timing_metrics.last_token_time kv_cache_metrics = perf_metrics.kv_cache_metrics assert kv_cache_metrics.num_total_allocated_blocks == 1 assert kv_cache_metrics.num_new_allocated_blocks == 1 assert kv_cache_metrics.num_reused_blocks == 0 assert kv_cache_metrics.num_missed_blocks == 1 assert kv_cache_metrics.kv_cache_hit_rate == 0 assert perf_metrics.first_iter is not None assert perf_metrics.iter - perf_metrics.first_iter == sampling_params.max_tokens - 1 assert perf_metrics.last_iter == perf_metrics.iter @skip_ray @pytest.mark.part3 def test_llm_prometheus(): test_prompts = [ 
"Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", ] sampling_params = SamplingParams(max_tokens=10, temperature=0.8, top_p=0.95) llm = LLM(model=llama_model_path, return_perf_metrics=True, kv_cache_config=global_kvcache_config) for test_prompt in test_prompts: request_output = llm.generate(test_prompt, sampling_params) assert request_output.metrics_dict is not None assert MetricNames.REQUEST_QUEUE_TIME in request_output.metrics_dict assert MetricNames.TPOT in request_output.metrics_dict assert MetricNames.TTFT in request_output.metrics_dict assert MetricNames.E2E in request_output.metrics_dict assert request_output.outputs is not None @skip_ray @pytest.mark.parametrize("streaming", [True, False]) @pytest.mark.part3 def test_llm_with_postprocess_parallel_and_result_handler(streaming): run_llm_with_postprocess_parallel_and_result_handler(streaming, "pytorch", tp_size=1) @pytest.mark.part0 def test_embedding_bias_with_torch_sampler_strategies(): """Test embedding bias application in TorchSampler.""" tokenizer = AutoTokenizer.from_pretrained(llama_model_path) biased_word_id = tokenizer.encode("Z", add_special_tokens=False)[-1] vocab_size_padded = 32000 embedding_bias = torch.zeros(vocab_size_padded) embedding_bias[biased_word_id] = torch.finfo(torch.float32).max sampling_kwargs = { "max_tokens": 6, "embedding_bias": embedding_bias, } # All test cases use greedy sampling for simplicity sampling_params = SamplingParams(**sampling_kwargs) llm_test_harness( llama_model_path, prompts, ["Z Z Z Z Z Z"], sampling_params=sampling_params, backend="pytorch", ) def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None: lora_config = LoraConfig( lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"], max_lora_rank=8, max_loras=2, max_cpu_loras=2) llm = LLM( model=f"{llm_models_root()}/llama-models/llama-7b-hf", lora_config=lora_config, # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None, **llm_kwargs) try: prompts = [ "美国的首都在哪里? \n答案:", ] references = [ "美国的首都是华盛顿。\n\n美国的", ] sampling_params = SamplingParams(max_tokens=20) lora_req = LoRARequest( "task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1") lora_request = [lora_req] outputs = llm.generate(prompts, sampling_params, lora_request=lora_request) assert similar(outputs[0].outputs[0].text, references[0]) finally: llm.shutdown() @skip_gpu_memory_less_than_40gb @pytest.mark.part0 def test_llama_7b_lora(): llama_7b_lora_from_dir_test_harness() @skip_gpu_memory_less_than_40gb def test_llama_7b_lora_default_modules() -> None: lora_config = LoraConfig(max_lora_rank=64, max_loras=2, max_cpu_loras=2) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" llm = LLM( model=hf_model_dir, lora_config=lora_config, # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) hf_lora_dir = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1" try: prompts = [ "美国的首都在哪里? 
\n答案:", ] references = [ "美国的首都是华盛顿。\n\n美国的", ] sampling_params = SamplingParams(max_tokens=20, add_special_tokens=False) lora_req = LoRARequest("luotuo", 1, hf_lora_dir) lora_request = [lora_req] outputs = llm.generate(prompts, sampling_params, lora_request=lora_request) assert similar(outputs[0].outputs[0].text, references[0]) finally: llm.shutdown() def _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], max_lora_rank=8, max_loras=max_loras, max_cpu_loras=max_cpu_loras) check_llama_7b_multi_unique_lora_adapters_from_request( lora_adapter_count_per_call, repeat_calls, repeats_per_call, LLM, lora_config=lora_config, # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) @skip_gpu_memory_less_than_40gb @skip_ray # https://nvbugs/5682551 @pytest.mark.part3 def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache(): """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single llm.generate call, that's repeated twice. """ # noqa: D205 _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call=[2], max_loras=1, max_cpu_loras=2, repeat_calls=2, repeats_per_call=3) @skip_gpu_memory_less_than_40gb @pytest.mark.part1 def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache(): """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU cache size < LoRA CPU cache size. """ # noqa: D205 _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call=[2, 2, 2], max_loras=1, max_cpu_loras=3, repeat_calls=1, repeats_per_call=1) @skip_gpu_memory_less_than_40gb @pytest.mark.part0 def test_llama_7b_multi_lora_read_from_cache_after_insert(): """Test that loading and then using the same adapters loaded in cache works.""" _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call=[3], max_loras=3, max_cpu_loras=3, repeat_calls=2, repeats_per_call=1) @skip_gpu_memory_less_than_40gb @pytest.mark.part3 def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache( ): """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU cache over multiple llm.generate call repeated twice (two calls with the same requests): At the end of the 1st llm.generate call: The LoRA caches should contain adapters 1, 2 and shouldn't contain adapter 0 (it should have been evicted). So in the 2nd call, the worker should: - Send req0 with adapter 0 weights (because it was previously evicted) - Send the other two requests without their adapter weights as they're already in LoRA CPU cache Then, handling of req0 that has weights but not in the cache should evict one of the other two adapters from the cache, causing that evicted adapter's request to again load its weights from the file system, as they aren't with the request and aren't in LoRA cache. 
""" # noqa: D205 _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call=[3], max_loras=2, max_cpu_loras=2, repeat_calls=2, repeats_per_call=1) @skip_gpu_memory_less_than_40gb @pytest.mark.part2 def test_llama_7b_peft_cache_config_affects_peft_cache_size(): """Tests that LLM arg of peft_cache_config affects the peft cache sizes. NOTE: The caller can't get the actual LoRA cache sizes, so we instead we test that it fails when configured with a value too small to contain a single adapter. """ # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. lora_config_no_cache_size_values = LoraConfig( lora_target_modules=['attn_q', 'attn_k', 'attn_v'], max_lora_rank=8) # Test that too small PeftCacheConfig.host_cache_size causes failure with pytest.raises(RuntimeError): check_llama_7b_multi_lora_from_request_test_harness( LLM, lora_config=lora_config_no_cache_size_values, peft_cache_config=PeftCacheConfig( host_cache_size=1), # size in bytes # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) # Test that too small PeftCacheConfig.device_cache_percent causes failure with pytest.raises(RuntimeError): check_llama_7b_multi_lora_from_request_test_harness( LLM, lora_config=lora_config_no_cache_size_values, peft_cache_config=PeftCacheConfig(device_cache_percent=0.0000001), # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) @skip_ray # https://nvbugs/5682551 @skip_gpu_memory_less_than_40gb @pytest.mark.part1 def test_llama_7b_lora_config_overrides_peft_cache_config(): """Tests that cache size args in lora_config LLM arg override the cache size parameters in peft_cache_config LLM arg. 
""" # noqa: D205 check_llama_7b_multi_lora_from_request_test_harness( LLM, lora_config=LoraConfig( lora_target_modules=['attn_q', 'attn_k', 'attn_v'], max_lora_rank=8, max_loras=2, max_cpu_loras=2), peft_cache_config=PeftCacheConfig( host_cache_size=1, # size in bytes device_cache_percent=0.0000001), # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high # https://jirasw.nvidia.com/browse/TRTLLM-5045 @pytest.mark.skip(reason="https://nvbugs/5448464") @skip_gpu_memory_less_than_138gb @pytest.mark.part1 def test_nemotron_nas_lora() -> None: lora_config = LoraConfig(lora_dir=[ f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64" ], max_lora_rank=64, max_loras=1, max_cpu_loras=1) llm = LLM( model= f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1", lora_config=lora_config, ) prompts = [ "Hello, how are you?", "Hello, how are you?", ] sampling_params = SamplingParams(max_tokens=10, add_special_tokens=False) lora_req = LoRARequest( "task-0", 0, f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64" ) lora_request = [lora_req, None] outputs = llm.generate(prompts, sampling_params, lora_request=lora_request) assert similar(outputs[0].outputs[0].text, outputs[1].outputs[0].text) @skip_gpu_memory_less_than_80gb @pytest.mark.part0 def test_llama_3_1_8b_fp8_with_bf16_lora() -> None: skip_fp8_pre_ada(use_fp8=True) model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8" lora_dir = f"{llm_models_root()}/lora/llama-3-chinese-8b-instruct-v2-lora" prompt = "美国的首都是哪里?" reference = "华盛顿特区。华盛顿特区是美国的首都和一个行政区" lora_config = LoraConfig(lora_dir=[lora_dir], max_lora_rank=64, max_loras=2, max_cpu_loras=2) lora_req = LoRARequest("lora-chinese", 0, lora_dir) llm = LLM( model_dir, lora_config=lora_config, # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) try: output = llm.generate(prompt, SamplingParams(max_tokens=20), lora_request=[lora_req]) finally: llm.shutdown() assert similar(output.outputs[0].text, reference) @skip_gpu_memory_less_than_80gb @pytest.mark.part2 def test_bielik_11b_v2_2_instruct_multi_lora() -> None: model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct" target_modules = ['attn_q', 'attn_k', 'attn_v'] # Set up temporary directory for LoRA adapters with tempfile.TemporaryDirectory() as lora_dir: print("Creating dummy LoRAs...") model = AutoModelForCausalLM.from_pretrained(model_dir, dtype=torch.bfloat16, device_map="auto") hf_modules = ["q_proj", "k_proj", "v_proj"] peft_lora_config = PeftLoraConfig(r=8, target_modules=hf_modules, bias="none", task_type="CAUSAL_LM") lora_paths = [] for i in range(2): lora_model = get_peft_model(model, peft_lora_config) for param in lora_model.parameters(): param.data.zero_() lora_path = f"{lora_dir}/lora_{i}" lora_model.save_pretrained(lora_path) lora_paths.append(lora_path) trtllm_lora_config = LoraConfig(lora_target_modules=target_modules, max_lora_rank=8, max_loras=2, max_cpu_loras=2) llm = LLM( model_dir, lora_config=trtllm_lora_config, # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA cuda_graph_config=None) prompts = [ "Kim był Mikołaj Kopernik i z czego zasłynął?", "Gdzie znajduje się stolica Polski?", ] lora_req1 = LoRARequest("lora-1", 0, lora_paths[0]) lora_req2 = LoRARequest("lora-2", 1, lora_paths[1]) 
lora_requests = [lora_req1, lora_req2] sampling_params = SamplingParams(max_tokens=200) outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests) assert len(outputs) == 2 @pytest.mark.part2 def test_gemma3_1b_instruct_multi_lora() -> None: model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it" target_modules = ['attn_q', 'attn_k', 'attn_v'] # Set up temporary directory for LoRA adapters with tempfile.TemporaryDirectory() as lora_dir: print("Creating dummy LoRAs...") model = AutoModelForCausalLM.from_pretrained(model_dir, dtype=torch.bfloat16, device_map="auto") hf_modules = ["q_proj", "k_proj", "v_proj"] peft_lora_config = PeftLoraConfig(r=8, target_modules=hf_modules, bias="none", task_type="CAUSAL_LM") lora_paths = [] for i in range(2): lora_model = get_peft_model(model, peft_lora_config) for param in lora_model.parameters(): param.data.zero_() lora_path = f"{lora_dir}/lora_{i}" lora_model.save_pretrained(lora_path) lora_paths.append(lora_path) trtllm_lora_config = LoraConfig(lora_dir=lora_paths, lora_target_modules=target_modules, max_lora_rank=8, max_loras=2, max_cpu_loras=2) # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size. kv_cache_config = KvCacheConfig( enable_block_reuse=False, enable_partial_reuse=False, ) llm = LLM(model_dir, lora_config=trtllm_lora_config, kv_cache_config=kv_cache_config) prompts = [ "Is it ok to fill diesel in a petrol car?", "What is the capital of France?", ] lora_req1 = LoRARequest("lora-1", 0, lora_paths[0]) lora_req2 = LoRARequest("lora-2", 1, lora_paths[1]) lora_requests = [lora_req1, lora_req2] sampling_params = SamplingParams(max_tokens=200) outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests) assert len(outputs) == 2 @pytest.mark.parametrize( "lora_rank,max_lora_rank,description", [ # (lora_rank, max_lora_rank, description) (8, 8, "rank_8"), (16, 16, "rank_16"), (4, 8, "rank_4_max_8"), ]) @pytest.mark.part3 def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank, description): """Test load_torch_nemo_lora function with different LoRA rank configurations.""" from tensorrt_llm.lora_manager import load_torch_nemo_lora nemo_path = create_mock_nemo_lora_checkpoint( tmp_path, hidden_size=2048, num_layers=16, lora_rank=lora_rank, ) lora_config = LoraConfig( lora_dir=[str(nemo_path)], lora_ckpt_source="nemo", max_lora_rank=max_lora_rank, ) # This should not raise an error load_torch_nemo_lora(lora_config) assert lora_config.lora_target_modules == [ "attn_qkv" ], f"Expected attn_qkv modules for {description}" assert lora_config.trtllm_modules_to_hf_modules == { "attn_qkv": "attn_qkv" }, f"Expected correct module mapping for {description}" @pytest.mark.part0 def test_nemo_lora_unsupported_modules_validation(tmp_path): """Test validation of unsupported modules in NeMo LoRA.""" from tensorrt_llm.lora_manager import load_torch_nemo_lora nemo_path = create_mock_nemo_lora_checkpoint( tmp_path, hidden_size=2048, num_layers=16, lora_rank=8, ) # Test validation: should fail with unsupported modules invalid_config = LoraConfig( lora_dir=[str(nemo_path)], lora_ckpt_source="nemo", lora_target_modules=["attn_qkv", "mlp_h_to_4h"], # mlp_h_to_4h not supported max_lora_rank=8, ) with pytest.raises(ValueError, match="NeMo LoRA only supports"): load_torch_nemo_lora(invalid_config) @force_ampere @pytest.mark.part1 def test_gqa_nemo_lora(tmp_path): """ Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama. 
    This test verifies two properties:
    1. That a NeMo-format LoRA checkpoint with GQA (grouped query attention)
       can be loaded and applied to a TinyLlama model, and that generation with
       this LoRA produces a deterministic, expected output for a fixed prompt
       and temperature=0.0.
    2. That the LoRA weights have a significant effect: generating with LoRA
       produces a different output than generating without LoRA, confirming
       that the LoRA adapter is actually being applied.

    The test uses a deterministic dummy LoRA checkpoint (seed=42) and checks
    both the positive (LoRA applied) and negative (no LoRA) cases for output
    text.
    """
    # TinyLlama's exact GQA configuration
    hidden_size = 2048
    num_layers = 22
    num_q_heads = 32  # Query attention heads
    num_kv_heads = 4  # Key/Value heads (GQA)
    lora_rank = 8

    nemo_path = create_mock_nemo_lora_checkpoint(
        tmp_path,
        hidden_size=hidden_size,
        num_layers=num_layers,
        lora_rank=lora_rank,
        num_attention_heads=num_q_heads,
        num_kv_heads=num_kv_heads,
        seed=42,  # NOTE: the seed=42 is important for the test to pass.
    )

    expected_lora_text_output = "Paris. The capital of France is Paris. The"
    test_prompts = ["The capital of France is"]
    sampling_params = SamplingParams(max_tokens=10, temperature=0.0)

    lora_config = LoraConfig(
        lora_dir=[str(nemo_path)],
        lora_ckpt_source="nemo",
        max_lora_rank=lora_rank,
    )

    model_path = get_model_path("llama-models-v2/TinyLlama-1.1B-Chat-v1.0")
    llm = LLM(
        model=model_path,
        lora_config=lora_config,
        kv_cache_config=global_kvcache_config,
    )
    try:
        lora_req = LoRARequest("tinyllama-gqa-test",
                               0,
                               str(nemo_path),
                               lora_ckpt_source="nemo")
        lora_outputs = llm.generate(test_prompts,
                                    sampling_params,
                                    lora_request=[lora_req])
        # For the above deterministic dummy LoRA checkpoint,
        # with temperature=0.0,
        # the expected output text should always be the same.
        assert lora_outputs[0].outputs[0].text == expected_lora_text_output, \
            f"Expected output text: {expected_lora_text_output}, " \
            f"got: {lora_outputs[0].outputs[0].text}"
        assert len(lora_outputs) == 1

        # Generate without LoRA.
        # The LoRA weights are tuned/large enough that
        # the LoRA output differs from a no-LoRA run.
        base_outputs = llm.generate(test_prompts, sampling_params)
        assert base_outputs[0].outputs[0].text != expected_lora_text_output, \
            f"No-LoRA output should differ from expected output text: {expected_lora_text_output}, " \
            f"got: {base_outputs[0].outputs[0].text}"
    finally:
        llm.shutdown()


class TestLlmError:

    @pytest.mark.part3
    def test_max_num_token_check(self):
        """LLM should raise an error when the prompt length exceeds the valid range."""
        llm = LLM(llama_model_path,
                  kv_cache_config=global_kvcache_config,
                  max_num_tokens=100)

        with pytest.raises(ValueError,
                           match="should not exceed max_num_tokens"):
            ids = [random.randint(10, 100) for _ in range(101)]
            llm.generate([ids])


class FailingExecutorWorker(GenerationExecutorWorker):
    """Mock worker that fails during initialization to test error handling."""

    def __init__(self, *args, **kwargs):
        # Simulate a constructor failure
        raise RuntimeError(
            "Mock GenerationExecutorWorker initialization failed")


FailingExecutor = type(
    "FailingExecutor", (), {
        "create":
        classmethod(lambda cls, *args, **kwargs: FailingExecutorWorker(
            *args, **kwargs))
    })


@skip_ray
@pytest.mark.part2
def test_llm_with_proxy_error():
    """Test that LLM properly handles GenerationExecutorWorker constructor failures.

    This test mocks the GenerationExecutorWorker to fail during __init__ and
    verifies that the LLM class properly catches and re-raises the error.
""" from unittest.mock import patch # Test that the error is properly caught and re-raised by LLM # We patch GenerationExecutor.create directly to return our failing worker with patch('tensorrt_llm.executor.executor.GenerationExecutor.create', side_effect=lambda *args, **kwargs: FailingExecutorWorker( *args, **kwargs)): with pytest.raises( RuntimeError, match="Mock GenerationExecutorWorker initialization failed"): llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config) @pytest.mark.part0 @pytest.mark.parametrize("use_speculative", [True, False]) def test_min_tokens(use_speculative: bool): """Check min_tokens is respected.""" llm_common_config = dict( model=llama_model_path, max_batch_size=2, kv_cache_config=global_kvcache_config, max_num_tokens=2048, ) if use_speculative: spec_config = NGramDecodingConfig( max_draft_len=4, max_matching_ngram_size=2, is_keep_all=True, is_use_oldest=True, is_public_pool=True, ) llm = LLM(**llm_common_config, speculative_config=spec_config) else: llm = LLM(**llm_common_config) output_len = 2000 sampling_params = SamplingParams(max_tokens=output_len, min_tokens=output_len, temperature=1) res = llm.generate("The end.", sampling_params=sampling_params) assert len(res.outputs) == 1 assert len(res.outputs[0].token_ids) == output_len @skip_ray @pytest.mark.parametrize( "prompt_logprobs, logprobs, return_context_logits, return_generation_logits, backend", [ (2, None, True, False, "pytorch"), # prompt_logprobs with context_logits (None, 1, False, False, "pytorch"), # generation logprobs only (top-1, PyTorch limit) (2, None, False, False, "pytorch"), # prompt_logprobs without context_logits (None, None, False, False, "pytorch"), # no logprobs at all ]) def test_llm_return_logprobs(prompt_logprobs: Optional[int], logprobs: Optional[int], return_context_logits: bool, return_generation_logits: bool, backend: str): llm_return_logprobs_test_harness(prompt_logprobs, logprobs, return_context_logits, return_generation_logits, backend=backend) @skip_ray @pytest.mark.parametrize( "prompt_logprobs, logprobs, return_context_logits, return_generation_logits", [ (None, 1, False, False), # generation logprobs only (top-1, PyTorch limit) (2, None, True, False), # prompt_logprobs with context_logits (2, None, False, False), # prompt_logprobs only (2, 1, False, False), # both prompt and generation logprobs (2, 3, False, False), # both prompt and generation logprobs ]) def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs, return_context_logits, return_generation_logits): llm_return_logprobs_test_harness(prompt_logprobs, logprobs, return_context_logits, return_generation_logits, streaming=True, backend="pytorch") class TestLlmError: @pytest.mark.part3 def test_max_num_token_check(self): """ LLM should raise error when got prompt length exceed the valid range. 
""" llm = LLM(llama_model_path, kv_cache_config=global_kvcache_config, max_num_tokens=100) with pytest.raises(ValueError, match="should not exceed max_num_tokens"): ids = [random.randint(10, 100) for _ in range(101)] llm.generate([ids]) @skip_ray @pytest.mark.parametrize("num_requests", [1, 5, 10]) def test_llm_rpc(num_requests: int): # TODO: remove the with-statement when shutdown hang issue is fixed with LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, orchestrator_type="rpc") as llm: assert isinstance(llm._executor, GenerationExecutorRpcProxy) res = llm.generate("Tell me a joke", sampling_params=SamplingParams(max_tokens=10, end_id=-1)) print(f"get result: {res}") assert len(res.outputs) == 1 assert len(res.outputs[0].token_ids) == 10 @skip_ray @pytest.mark.asyncio async def test_llm_rpc_streaming(): # TODO: remove the with-statement when shutdown hang issue is fixed with LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, orchestrator_type="rpc") as llm: assert isinstance(llm._executor, GenerationExecutorRpcProxy) outputs = [] async for output in llm.generate_async("Tell me a joke", sampling_params=SamplingParams( max_tokens=10, end_id=-1), streaming=True): outputs.append(output.outputs[0].text) "".join(outputs) print(f"get result: {outputs}") @skip_ray def test_llm_rpc_get_stats(): """Test that get_stats works with RPC orchestrator.""" with LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, enable_iter_perf_stats=True, orchestrator_type="rpc") as llm: assert isinstance(llm._executor, GenerationExecutorRpcProxy) # Generate some output to produce stats for output in llm.generate( prompts, sampling_params=SamplingParams(max_tokens=5)): print(output) stats = llm.get_stats(timeout=5) assert len(stats) > 0, "Should have at least one stats entry" # Stats should be JSON strings that can be parsed parsed = json.loads(stats[0]) if isinstance(stats[0], str) else stats[0] assert "iter" in parsed, "Stats should contain 'iter' field" assert "cpuMemUsage" in parsed, "Stats should contain 'cpuMemUsage' field" @skip_ray @pytest.mark.asyncio async def test_llm_rpc_get_stats_async(): """Test that get_stats_async works with RPC orchestrator.""" import json with LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, enable_iter_perf_stats=True, orchestrator_type="rpc") as llm: assert isinstance(llm._executor, GenerationExecutorRpcProxy) # Generate some output to produce stats async for output in llm.generate_async( prompts[0], sampling_params=SamplingParams(max_tokens=5)): print(output) # Get stats via async API stats_result = llm.get_stats_async(timeout=2) # Should be able to iterate over results stats_count = 0 async for stat in stats_result: parsed = json.loads(stat) if isinstance(stat, str) else stat assert "iter" in parsed, "Stats should contain 'iter' field" stats_count += 1 if stats_count >= 1: break # Just verify we can get at least one assert stats_count > 0, "Should have received at least one stat" @pytest.mark.threadleak(enabled=False) @pytest.mark.part0 @skip_ray def test_llm_context_only_timed_out(): tp_size = 1 use_overlap = False enable_iter_req_stats = False llm_args_extra = {} llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, disable_overlap_scheduler=not use_overlap)) llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config, tensor_parallel_size=tp_size, cache_transceiver_config=CacheTransceiverConfig( backend="UCX", kv_transfer_timeout_ms=1000), 
**llm_args_extra) max_tokens = 1 sampling_params = SamplingParams(max_tokens=max_tokens) disaggregated_params = DisaggregatedParams(request_type="context_only") prompts0 = [ "What is your name?", ] prompts1 = [ "Nvidia is awesome because", ] # Send context-only request for output in llm.generate(prompts1, sampling_params=sampling_params, disaggregated_params=disaggregated_params): print(output) max_retries = 10 for _ in range(max_retries): results = llm.get_stats(2) if len(results) == 1: break time.sleep(1) else: pytest.fail( f"Failed to get stats with len==1 after {max_retries} retries") assert len(results) == 1 context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"] print(f"Context only used num blocks: {context_only_used_num_blocks}") # Sleep 5 seconds to allow context only request to time out time.sleep(5) # Send regular request for output in llm.generate(prompts0, sampling_params=sampling_params): print(output) # Get number of allocated blocks results = llm.get_stats(2) assert len(results) == 1 final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"] assert final_used_num_blocks == 0 # This test is to verify that when the KV cache is exhausted and scheduled batch size is 0, the context only request will be aborted due to timeout. @pytest.mark.threadleak(enabled=False) @pytest.mark.part0 @skip_ray @pytest.mark.parametrize("sender_future_timeout_ms", [100, 1000]) @pytest.mark.parametrize("backend", ["NIXL", "UCX"]) def test_llm_context_only_timed_out_kv_cache_exhausted(sender_future_timeout_ms, backend): tp_size = 1 use_overlap = False enable_iter_req_stats = False llm_args_extra = {} llm_args_extra.update( dict(enable_iter_perf_stats=True, enable_iter_req_stats=enable_iter_req_stats, disable_overlap_scheduler=not use_overlap)) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.1, max_tokens=1000, enable_block_reuse=False) llm = LLM( model=llama_model_path, kv_cache_config=kv_cache_config, tensor_parallel_size=tp_size, cache_transceiver_config=CacheTransceiverConfig( backend=backend, kv_transfer_timeout_ms=1000, kv_transfer_sender_future_timeout_ms=sender_future_timeout_ms), **llm_args_extra) max_tokens = 1 sampling_params = SamplingParams(max_tokens=max_tokens) disaggregated_params = DisaggregatedParams(request_type="context_only") prompts0 = [ "What is your name?", ] prompts1 = [ "lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua " * 10 ] # Send context-only request for output in llm.generate(prompts1 * 10, sampling_params=sampling_params, disaggregated_params=disaggregated_params): print(output) max_retries = 10 all_results = [] for _ in range(max_retries): results = llm.get_stats(2) all_results.extend(results) assert len(all_results) > 0 context_only_used_num_blocks = all_results[-1]["kvCacheStats"][ "usedNumBlocks"] print(f"Context only used num blocks: {context_only_used_num_blocks}") # Sleep 5 seconds to allow context only request to time out time.sleep(5) # Send regular request for output in llm.generate(prompts0, sampling_params=sampling_params): print(output) # Get number of allocated blocks results = llm.get_stats(2) assert len(results) == 1 final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"] assert final_used_num_blocks == 0
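

# Illustrative sketch (not referenced by the tests above): the context-only
# timeout tests poll llm.get_stats() until an entry arrives. A hypothetical
# helper such as the one below captures that retry pattern; it only relies on
# calls already exercised in this file (llm.get_stats, time.sleep, pytest.fail).
def _poll_stats_until_nonempty(llm, timeout: int = 2, max_retries: int = 10):
    """Return the first non-empty result of ``llm.get_stats``.

    Retries up to ``max_retries`` times, sleeping one second between attempts,
    and fails the current test if no stats entry ever shows up.
    """
    for _ in range(max_retries):
        results = llm.get_stats(timeout)
        if results:
            return results
        time.sleep(1)
    pytest.fail(f"Failed to get stats after {max_retries} retries")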