TensorRT-LLMs/tests/unittest/llmapi/test_llm_pytorch.py

import json
import random
import time
from contextlib import contextmanager, nullcontext
from typing import Optional

import pytest

from tensorrt_llm import LLM
from tensorrt_llm.disaggregated_params import DisaggregatedParams
from tensorrt_llm.executor import GenerationExecutorWorker
from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy
from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.metrics import MetricNames
from tensorrt_llm.sampling_params import SamplingParams

# isort: off
from .lora_test_utils import (
    check_llama_7b_multi_lora_from_request_test_harness,
    check_llama_7b_multi_unique_lora_adapters_from_request,
    create_mock_nemo_lora_checkpoint)
from .test_llm import (_test_llm_capture_request_error, get_model_path,
                       global_kvcache_config, llama_model_path,
                       llm_get_stats_async_test_harness,
                       llm_get_stats_test_harness,
                       llm_return_logprobs_test_harness, llm_test_harness,
                       prompts, run_llm_abort_request,
                       run_llm_with_postprocess_parallel_and_result_handler,
                       tinyllama_logits_processor_test_harness)
from utils.util import (force_ampere, similar, skip_fp8_pre_ada,
                        skip_gpu_memory_less_than_40gb,
                        skip_gpu_memory_less_than_80gb,
                        skip_gpu_memory_less_than_138gb, skip_ray)
from utils.llm_data import llm_models_root
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.executor.request import LoRARequest
import tempfile

import torch
from peft import LoraConfig as PeftLoraConfig
from peft import get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

# isort: on


@force_ampere
@pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
@pytest.mark.part2
def test_tinyllama_logits_processor(enable_chunked_prefill):
    tinyllama_logits_processor_test_harness(
        backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)


@skip_ray
@pytest.mark.parametrize(
    "return_context_logits, use_overlap, enable_chunked_prefill, enable_iter_req_stats",
    [
        (False, False, False, True),
        (False, False, True, True),
        (False, True, False, True),
        (False, True, True, True),
    ])
@pytest.mark.part0
def test_llm_get_stats(return_context_logits, use_overlap,
                       enable_chunked_prefill, enable_iter_req_stats):
    llm_get_stats_test_harness(tp_size=1,
                               pp_size=1,
                               return_context_logits=return_context_logits,
                               pytorch_backend=True,
                               use_overlap=use_overlap,
                               enable_chunked_prefill=enable_chunked_prefill,
                               enable_iter_req_stats=enable_iter_req_stats)


@skip_ray
@pytest.mark.parametrize(
    "return_context_logits, use_overlap, enable_chunked_prefill, enable_iter_req_stats",
    [
        (False, False, False, True),
        (False, False, True, True),
        (False, True, False, True),
        (False, True, True, True),
    ])
@pytest.mark.part1
def test_llm_get_stats_async(return_context_logits, use_overlap,
                             enable_chunked_prefill, enable_iter_req_stats):
    llm_get_stats_async_test_harness(
        tp_size=1,
        pp_size=1,
        return_context_logits=return_context_logits,
        pytorch_backend=True,
        use_overlap=use_overlap,
        enable_chunked_prefill=enable_chunked_prefill,
        enable_iter_req_stats=enable_iter_req_stats)


@pytest.mark.part1
def test_llm_capture_request_error():
    _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)


@force_ampere
@pytest.mark.mpi_ray_parity
@pytest.mark.parametrize(
    "sampling_params",
    [
        SamplingParams()  # pytorch only supports n=1
    ])
@pytest.mark.part0
def test_llm_abort_request(sampling_params):
    llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
    run_llm_abort_request(llm=llm, sampling_params=sampling_params)


@contextmanager
def _validate_invalid_token_error_scope():
    with pytest.raises(RuntimeError) as exc_info:
        yield
    assert "Token ID out of range" in str(exc_info.value)


@force_ampere
@pytest.mark.part1
def test_llm_invalid_input_token():
    llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
    prompts = [
        [-1],
    ]
    # NB: exc_info in _validate_invalid_token_error_scope creates a reference
    #     to a traceback which outlives the scope of 'exc_info' and prevents
    #     deletion of 'llm'. However, using the context manager protocol is
    #     anyways more robust than delegating cleanup to __del__.
    with llm:
        with _validate_invalid_token_error_scope():
            llm.generate(
                prompts,
                sampling_params=SamplingParams(max_tokens=5),
            )


@force_ampere
@pytest.mark.part0
def test_llm_invalid_input_token_async():
    llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
    # NB: exc_info in _validate_invalid_token_error_scope creates a reference
    #     to a traceback which outlives the scope of 'exc_info' and prevents
    #     deletion of 'llm'. However, using the context manager protocol is
    #     anyways more robust than delegating cleanup to __del__.
    with llm:
        prompts = [
            [-1],
            [42],
        ]
        fail_idx = [0]
        for submit_order in [[0, 1], [1, 0]]:
            for collect_order in [[0, 1], [1, 0]]:
                print(f"submitting {submit_order}")
                futures = [
                    llm.generate_async(
                        prompts[submit_idx],
                        sampling_params=SamplingParams(max_tokens=5),
                    ) for submit_idx in submit_order
                ]
                for collect_idx in collect_order:
                    with _validate_invalid_token_error_scope(
                    ) if submit_order[collect_idx] in fail_idx else nullcontext(
                    ):
                        print(
                            f"collect order {collect_order}, collecting {collect_idx}"
                        )
                        futures[collect_idx].result()


@pytest.mark.part2
def test_llm_reward_model():
    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]

    llm = LLM(model=rm_model_path,
              attn_backend="VANILLA",
              disable_overlap_scheduler=True)

    sampling_params = SamplingParams(return_context_logits=True)

    outputs = llm.generate(prompts, sampling_params)
    scores = outputs[0].context_logits

    print(scores)

    assert scores.shape == (tokenized_input.shape[1], 2)
    assert not outputs[0].outputs[0].text


@skip_ray
@pytest.mark.part3
def test_llm_perf_metrics():
    with LLM(model=llama_model_path,
             kv_cache_config=global_kvcache_config) as llm:
        sampling_params = SamplingParams(max_tokens=10,
                                         return_perf_metrics=True)
        outputs = llm.generate(prompts, sampling_params)
        assert outputs[0].outputs[0].request_perf_metrics is not None

        perf_metrics = outputs[0].outputs[0].request_perf_metrics

        timing_metrics = perf_metrics.timing_metrics
        assert timing_metrics.arrival_time < timing_metrics.first_scheduled_time
        assert timing_metrics.first_scheduled_time < timing_metrics.first_token_time
        assert timing_metrics.first_token_time < timing_metrics.last_token_time

        kv_cache_metrics = perf_metrics.kv_cache_metrics
        assert kv_cache_metrics.num_total_allocated_blocks == 1
        assert kv_cache_metrics.num_new_allocated_blocks == 1
        assert kv_cache_metrics.num_reused_blocks == 0
        assert kv_cache_metrics.num_missed_blocks == 1
        assert kv_cache_metrics.kv_cache_hit_rate == 0

        assert perf_metrics.first_iter is not None
        assert perf_metrics.iter - perf_metrics.first_iter == sampling_params.max_tokens - 1
        assert perf_metrics.last_iter == perf_metrics.iter


@skip_ray
@pytest.mark.part3
def test_llm_prometheus():
    test_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(max_tokens=10, temperature=0.8, top_p=0.95)
    llm = LLM(model=llama_model_path,
              return_perf_metrics=True,
              kv_cache_config=global_kvcache_config)
    for test_prompt in test_prompts:
        request_output = llm.generate(test_prompt, sampling_params)
        assert request_output.metrics_dict is not None
        assert MetricNames.REQUEST_QUEUE_TIME in request_output.metrics_dict
        assert MetricNames.TPOT in request_output.metrics_dict
        assert MetricNames.TTFT in request_output.metrics_dict
        assert MetricNames.E2E in request_output.metrics_dict
        assert request_output.outputs is not None


@skip_ray
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.part3
def test_llm_with_postprocess_parallel_and_result_handler(streaming):
    run_llm_with_postprocess_parallel_and_result_handler(streaming,
                                                         "pytorch",
                                                         tp_size=1)


@pytest.mark.part0
def test_embedding_bias_with_torch_sampler_strategies():
    """Test embedding bias application in TorchSampler."""
    tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
    biased_word_id = tokenizer.encode("Z", add_special_tokens=False)[-1]
    vocab_size_padded = 32000
    embedding_bias = torch.zeros(vocab_size_padded)
    embedding_bias[biased_word_id] = torch.finfo(torch.float32).max

    sampling_kwargs = {
        "max_tokens": 6,
        "embedding_bias": embedding_bias,
    }

    # All test cases use greedy sampling for simplicity

    sampling_params = SamplingParams(**sampling_kwargs)

    llm_test_harness(
        llama_model_path,
        prompts,
        ["Z Z Z Z Z Z"],
        sampling_params=sampling_params,
        backend="pytorch",
    )


def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
    lora_config = LoraConfig(
        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
        max_lora_rank=8,
        max_loras=2,
        max_cpu_loras=2)
    llm = LLM(
        model=f"{llm_models_root()}/llama-models/llama-7b-hf",
        lora_config=lora_config,
        # Disable CUDA graph
        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
        cuda_graph_config=None,
        **llm_kwargs)
    try:
        prompts = [
            "美国的首都在哪里? \n答案:",
        ]
        references = [
            "美国的首都是华盛顿。\n\n美国的",
        ]
        sampling_params = SamplingParams(max_tokens=20)
        lora_req = LoRARequest(
            "task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
        lora_request = [lora_req]

        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_request)
        assert similar(outputs[0].outputs[0].text, references[0])
    finally:
        llm.shutdown()


@skip_gpu_memory_less_than_40gb
@pytest.mark.part0
def test_llama_7b_lora():
    llama_7b_lora_from_dir_test_harness()


@skip_gpu_memory_less_than_40gb
def test_llama_7b_lora_default_modules() -> None:
    lora_config = LoraConfig(max_lora_rank=64, max_loras=2, max_cpu_loras=2)

    hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"

    llm = LLM(
        model=hf_model_dir,
        lora_config=lora_config,
        # Disable CUDA graph
        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
        cuda_graph_config=None)

    hf_lora_dir = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"
    try:
        prompts = [
            "美国的首都在哪里? \n答案:",
        ]
        references = [
            "美国的首都是华盛顿。\n\n美国的",
        ]
        sampling_params = SamplingParams(max_tokens=20,
                                         add_special_tokens=False)
        lora_req = LoRARequest("luotuo", 1, hf_lora_dir)
        lora_request = [lora_req]

        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_request)

        assert similar(outputs[0].outputs[0].text, references[0])
    finally:
        llm.shutdown()


def _check_llama_7b_multi_lora_evict_load_new_adapters(
        lora_adapter_count_per_call: list[int], max_loras: int,
        max_cpu_loras: int, repeat_calls: int, repeats_per_call: int):
    # For LoRA checkpoints without finetuned embedding and lm_head, we can either:
    # (1) specify lora_target_modules, or
    # (2) provide a lora_dir to infer the lora_target_modules.
    lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'],
                             max_lora_rank=8,
                             max_loras=max_loras,
                             max_cpu_loras=max_cpu_loras)
    check_llama_7b_multi_unique_lora_adapters_from_request(
        lora_adapter_count_per_call,
        repeat_calls,
        repeats_per_call,
        LLM,
        lora_config=lora_config,
        # Disable CUDA graph
        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
        cuda_graph_config=None)


@skip_gpu_memory_less_than_40gb
@skip_ray  # https://nvbugs/5682551
@pytest.mark.part3
def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
    """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single
    llm.generate call, that's repeated twice.
    """  # noqa: D205
    _check_llama_7b_multi_lora_evict_load_new_adapters(
        lora_adapter_count_per_call=[2],
        max_loras=1,
        max_cpu_loras=2,
        repeat_calls=2,
        repeats_per_call=3)


@skip_gpu_memory_less_than_40gb
@pytest.mark.part1
def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
    """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU
    cache size < LoRA CPU cache size.
    """  # noqa: D205
    _check_llama_7b_multi_lora_evict_load_new_adapters(
        lora_adapter_count_per_call=[2, 2, 2],
        max_loras=1,
        max_cpu_loras=3,
        repeat_calls=1,
        repeats_per_call=1)


@skip_gpu_memory_less_than_40gb
@pytest.mark.part0
def test_llama_7b_multi_lora_read_from_cache_after_insert():
    """Test that loading and then using the same adapters loaded in cache works."""
    _check_llama_7b_multi_lora_evict_load_new_adapters(
        lora_adapter_count_per_call=[3],
        max_loras=3,
        max_cpu_loras=3,
        repeat_calls=2,
        repeats_per_call=1)


@skip_gpu_memory_less_than_40gb
@pytest.mark.part3
def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache(
):
    """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU
    cache over multiple llm.generate call repeated twice (two calls with the same requests):
    At the end of the 1st llm.generate call:
      The LoRA caches should contain adapters 1, 2 and shouldn't contain adapter 0 (it should have been evicted).
    So in the 2nd call, the worker should:
    - Send req0 with adapter 0 weights (because it was previously evicted)
    - Send the other two requests without their adapter weights as they're already in LoRA CPU cache
    Then, handling of req0 that has weights but not in the cache should evict one of the other two adapters from
    the cache, causing that evicted adapter's request to again load its weights from the file system, as they
    aren't with the request and aren't in LoRA cache.
    """  # noqa: D205
    _check_llama_7b_multi_lora_evict_load_new_adapters(
        lora_adapter_count_per_call=[3],
        max_loras=2,
        max_cpu_loras=2,
        repeat_calls=2,
        repeats_per_call=1)


@skip_gpu_memory_less_than_40gb
@pytest.mark.part2
def test_llama_7b_peft_cache_config_affects_peft_cache_size():
    """Tests that LLM arg of peft_cache_config affects the peft cache sizes.

    NOTE: The caller can't get the actual LoRA cache sizes, so we instead we
    test that it fails when configured with a value too small to contain a
    single adapter.
    """
    # For LoRA checkpoints without finetuned embedding and lm_head, we can either:
    # (1) specify lora_target_modules, or
    # (2) provide a lora_dir to infer the lora_target_modules.
    lora_config_no_cache_size_values = LoraConfig(
        lora_target_modules=['attn_q', 'attn_k', 'attn_v'], max_lora_rank=8)

    # Test that too small PeftCacheConfig.host_cache_size causes failure
    with pytest.raises(RuntimeError):
        check_llama_7b_multi_lora_from_request_test_harness(
            LLM,
            lora_config=lora_config_no_cache_size_values,
            peft_cache_config=PeftCacheConfig(
                host_cache_size=1),  # size in bytes
            # Disable CUDA graph
            # TODO: remove this once we have a proper fix for CUDA graph in LoRA
            cuda_graph_config=None)

    # Test that too small PeftCacheConfig.device_cache_percent causes failure
    with pytest.raises(RuntimeError):
        check_llama_7b_multi_lora_from_request_test_harness(
            LLM,
            lora_config=lora_config_no_cache_size_values,
            peft_cache_config=PeftCacheConfig(device_cache_percent=0.0000001),
            # Disable CUDA graph
            # TODO: remove this once we have a proper fix for CUDA graph in LoRA
            cuda_graph_config=None)


@skip_ray  # https://nvbugs/5682551
@skip_gpu_memory_less_than_40gb
@pytest.mark.part1
def test_llama_7b_lora_config_overrides_peft_cache_config():
    """Tests that cache size args in lora_config LLM arg override the cache size
    parameters in peft_cache_config LLM arg.
    """    # noqa: D205
    check_llama_7b_multi_lora_from_request_test_harness(
        LLM,
        lora_config=LoraConfig(
            lora_target_modules=['attn_q', 'attn_k', 'attn_v'],
            max_lora_rank=8,
            max_loras=2,
            max_cpu_loras=2),
        peft_cache_config=PeftCacheConfig(
            host_cache_size=1,  # size in bytes
            device_cache_percent=0.0000001),
        # Disable CUDA graph
        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
        cuda_graph_config=None)


# TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high
# https://jirasw.nvidia.com/browse/TRTLLM-5045
@pytest.mark.skip(reason="https://nvbugs/5448464")
@skip_gpu_memory_less_than_138gb
@pytest.mark.part1
def test_nemotron_nas_lora() -> None:
    lora_config = LoraConfig(lora_dir=[
        f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
    ],
                             max_lora_rank=64,
                             max_loras=1,
                             max_cpu_loras=1)

    llm = LLM(
        model=
        f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
        lora_config=lora_config,
    )

    prompts = [
        "Hello, how are you?",
        "Hello, how are you?",
    ]

    sampling_params = SamplingParams(max_tokens=10, add_special_tokens=False)
    lora_req = LoRARequest(
        "task-0", 0,
        f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
    )
    lora_request = [lora_req, None]

    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    assert similar(outputs[0].outputs[0].text, outputs[1].outputs[0].text)


@skip_gpu_memory_less_than_80gb
@pytest.mark.part0
def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
    skip_fp8_pre_ada(use_fp8=True)
    model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
    lora_dir = f"{llm_models_root()}/lora/llama-3-chinese-8b-instruct-v2-lora"
    prompt = "美国的首都是哪里？"
    reference = "华盛顿特区。华盛顿特区是美国的首都和一个行政区"

    lora_config = LoraConfig(lora_dir=[lora_dir],
                             max_lora_rank=64,
                             max_loras=2,
                             max_cpu_loras=2)
    lora_req = LoRARequest("lora-chinese", 0, lora_dir)

    llm = LLM(
        model_dir,
        lora_config=lora_config,
        # Disable CUDA graph
        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
        cuda_graph_config=None)

    try:
        output = llm.generate(prompt,
                              SamplingParams(max_tokens=20),
                              lora_request=[lora_req])
    finally:
        llm.shutdown()
    assert similar(output.outputs[0].text, reference)


@skip_gpu_memory_less_than_80gb
@pytest.mark.part2
def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
    model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"

    target_modules = ['attn_q', 'attn_k', 'attn_v']

    # Set up temporary directory for LoRA adapters
    with tempfile.TemporaryDirectory() as lora_dir:
        print("Creating dummy LoRAs...")

        model = AutoModelForCausalLM.from_pretrained(model_dir,
                                                     dtype=torch.bfloat16,
                                                     device_map="auto")
        hf_modules = ["q_proj", "k_proj", "v_proj"]
        peft_lora_config = PeftLoraConfig(r=8,
                                          target_modules=hf_modules,
                                          bias="none",
                                          task_type="CAUSAL_LM")
        lora_paths = []
        for i in range(2):
            lora_model = get_peft_model(model, peft_lora_config)
            for param in lora_model.parameters():
                param.data.zero_()
            lora_path = f"{lora_dir}/lora_{i}"
            lora_model.save_pretrained(lora_path)
            lora_paths.append(lora_path)

        trtllm_lora_config = LoraConfig(lora_target_modules=target_modules,
                                        max_lora_rank=8,
                                        max_loras=2,
                                        max_cpu_loras=2)
        llm = LLM(
            model_dir,
            lora_config=trtllm_lora_config,
            # Disable CUDA graph
            # TODO: remove this once we have a proper fix for CUDA graph in LoRA
            cuda_graph_config=None)

        prompts = [
            "Kim był Mikołaj Kopernik i z czego zasłynął?",
            "Gdzie znajduje się stolica Polski?",
        ]
        lora_req1 = LoRARequest("lora-1", 0, lora_paths[0])
        lora_req2 = LoRARequest("lora-2", 1, lora_paths[1])
        lora_requests = [lora_req1, lora_req2]
        sampling_params = SamplingParams(max_tokens=200)

        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_requests)

        assert len(outputs) == 2


@pytest.mark.part2
def test_gemma3_1b_instruct_multi_lora() -> None:
    model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it"

    target_modules = ['attn_q', 'attn_k', 'attn_v']

    # Set up temporary directory for LoRA adapters
    with tempfile.TemporaryDirectory() as lora_dir:
        print("Creating dummy LoRAs...")

        model = AutoModelForCausalLM.from_pretrained(model_dir,
                                                     dtype=torch.bfloat16,
                                                     device_map="auto")
        hf_modules = ["q_proj", "k_proj", "v_proj"]
        peft_lora_config = PeftLoraConfig(r=8,
                                          target_modules=hf_modules,
                                          bias="none",
                                          task_type="CAUSAL_LM")
        lora_paths = []
        for i in range(2):
            lora_model = get_peft_model(model, peft_lora_config)
            for param in lora_model.parameters():
                param.data.zero_()
            lora_path = f"{lora_dir}/lora_{i}"
            lora_model.save_pretrained(lora_path)
            lora_paths.append(lora_path)

        trtllm_lora_config = LoraConfig(lora_dir=lora_paths,
                                        lora_target_modules=target_modules,
                                        max_lora_rank=8,
                                        max_loras=2,
                                        max_cpu_loras=2)
        # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=False,
            enable_partial_reuse=False,
        )
        llm = LLM(model_dir,
                  lora_config=trtllm_lora_config,
                  kv_cache_config=kv_cache_config)

        prompts = [
            "Is it ok to fill diesel in a petrol car?",
            "What is the capital of France?",
        ]
        lora_req1 = LoRARequest("lora-1", 0, lora_paths[0])
        lora_req2 = LoRARequest("lora-2", 1, lora_paths[1])
        lora_requests = [lora_req1, lora_req2]
        sampling_params = SamplingParams(max_tokens=200)

        outputs = llm.generate(prompts,
                               sampling_params,
                               lora_request=lora_requests)

        assert len(outputs) == 2


@pytest.mark.parametrize(
    "lora_rank,max_lora_rank,description",
    [
        # (lora_rank, max_lora_rank, description)
        (8, 8, "rank_8"),
        (16, 16, "rank_16"),
        (4, 8, "rank_4_max_8"),
    ])
@pytest.mark.part3
def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
                                       description):
    """Test load_torch_nemo_lora function with different LoRA rank configurations."""
    from tensorrt_llm.lora_manager import load_torch_nemo_lora

    nemo_path = create_mock_nemo_lora_checkpoint(
        tmp_path,
        hidden_size=2048,
        num_layers=16,
        lora_rank=lora_rank,
    )

    lora_config = LoraConfig(
        lora_dir=[str(nemo_path)],
        lora_ckpt_source="nemo",
        max_lora_rank=max_lora_rank,
    )

    # This should not raise an error
    load_torch_nemo_lora(lora_config)

    assert lora_config.lora_target_modules == [
        "attn_qkv"
    ], f"Expected attn_qkv modules for {description}"
    assert lora_config.trtllm_modules_to_hf_modules == {
        "attn_qkv": "attn_qkv"
    }, f"Expected correct module mapping for {description}"


@pytest.mark.part0
def test_nemo_lora_unsupported_modules_validation(tmp_path):
    """Test validation of unsupported modules in NeMo LoRA."""
    from tensorrt_llm.lora_manager import load_torch_nemo_lora

    nemo_path = create_mock_nemo_lora_checkpoint(
        tmp_path,
        hidden_size=2048,
        num_layers=16,
        lora_rank=8,
    )

    # Test validation: should fail with unsupported modules
    invalid_config = LoraConfig(
        lora_dir=[str(nemo_path)],
        lora_ckpt_source="nemo",
        lora_target_modules=["attn_qkv",
                             "mlp_h_to_4h"],  # mlp_h_to_4h not supported
        max_lora_rank=8,
    )

    with pytest.raises(ValueError, match="NeMo LoRA only supports"):
        load_torch_nemo_lora(invalid_config)


@force_ampere
@pytest.mark.part1
def test_gqa_nemo_lora(tmp_path):
    """
    Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama.

    This test verifies two properties:
    1. That a NeMo-format LoRA checkpoint with GQA (grouped query attention) can be loaded and applied to a TinyLlama model,
       and that generation with this LoRA produces a deterministic, expected output for a fixed prompt and temperature=0.0.
    2. That the LoRA weights have a significant effect: generating with LoRA produces a different output than generating
       without LoRA, confirming that the LoRA adapter is actually being applied.

    The test uses a deterministic dummy LoRA checkpoint (seed=42) and checks both the positive (LoRA applied) and negative
    (no LoRA) cases for output text.
    """
    # TinyLlama's exact GQA configuration
    hidden_size = 2048
    num_layers = 22
    num_q_heads = 32  # Query attention heads
    num_kv_heads = 4  # Key/Value heads (GQA)
    lora_rank = 8

    nemo_path = create_mock_nemo_lora_checkpoint(
        tmp_path,
        hidden_size=hidden_size,
        num_layers=num_layers,
        lora_rank=lora_rank,
        num_attention_heads=num_q_heads,
        num_kv_heads=num_kv_heads,
        seed=42,  # NOTE: the seed=42 is important for the test to pass.
    )
    expected_lora_text_output = "Paris. The capital of France is Paris. The"
    test_prompts = ["The capital of France is"]
    sampling_params = SamplingParams(max_tokens=10, temperature=0.0)

    lora_config = LoraConfig(
        lora_dir=[str(nemo_path)],
        lora_ckpt_source="nemo",
        max_lora_rank=lora_rank,
    )

    model_path = get_model_path("llama-models-v2/TinyLlama-1.1B-Chat-v1.0")

    llm = LLM(
        model=model_path,
        lora_config=lora_config,
        kv_cache_config=global_kvcache_config,
    )

    try:
        lora_req = LoRARequest("tinyllama-gqa-test",
                               0,
                               str(nemo_path),
                               lora_ckpt_source="nemo")

        lora_outputs = llm.generate(test_prompts,
                                    sampling_params,
                                    lora_request=[lora_req])

        # For the above deterministic dummy LoRA checkpoint,
        # with temperature=0.0,
        # the expected output text should always be the same.
        assert lora_outputs[0].outputs[0].text == expected_lora_text_output, \
            f"Expected output text: {expected_lora_text_output}, " \
            f"got: {lora_outputs[0].outputs[0].text}"
        assert len(lora_outputs) == 1

        # Generate without LoRA.
        # The LoRA weights are tuned/large enough that
        # they differ from a no-LoRA run.
        base_outputs = llm.generate(test_prompts, sampling_params)
        assert base_outputs[0].outputs[0].text != expected_lora_text_output, \
            f"No-LoRA output should differ from expected output text: {expected_lora_text_output}, " \
            f"got: {base_outputs[0].outputs[0].text}"
    finally:
        llm.shutdown()


class TestLlmError:

    @pytest.mark.part3
    def test_max_num_token_check(self):
        """ LLM should raise error when got prompt length exceed the valid range. """
        llm = LLM(llama_model_path,
                  kv_cache_config=global_kvcache_config,
                  max_num_tokens=100)

        with pytest.raises(ValueError,
                           match="should not exceed max_num_tokens"):
            ids = [random.randint(10, 100) for _ in range(101)]
            llm.generate([ids])


class FailingExecutorWorker(GenerationExecutorWorker):
    """Mock worker that fails during initialization to test error handling."""

    def __init__(self, *args, **kwargs):
        # Simulate a constructor failure
        raise RuntimeError(
            "Mock GenerationExecutorWorker initialization failed")


FailingExecutor = type(
    "FailingExecutor", (), {
        "create":
        classmethod(
            lambda cls, *args, **kwargs: FailingExecutorWorker(*args, **kwargs))
    })


@skip_ray
@pytest.mark.part2
def test_llm_with_proxy_error():
    """Test that LLM properly handles GenerationExecutorWorker constructor failures.

    This test mocks the GenerationExecutorWorker to fail during __init__ and
    verifies that the LLM class properly catches and re-raises the error.
    """
    from unittest.mock import patch

    # Test that the error is properly caught and re-raised by LLM
    # We patch GenerationExecutor.create directly to return our failing worker
    with patch('tensorrt_llm.executor.executor.GenerationExecutor.create',
               side_effect=lambda *args, **kwargs: FailingExecutorWorker(
                   *args, **kwargs)):
        with pytest.raises(
                RuntimeError,
                match="Mock GenerationExecutorWorker initialization failed"):
            llm = LLM(model=llama_model_path,
                      kv_cache_config=global_kvcache_config)


@pytest.mark.part0
@pytest.mark.parametrize("use_speculative", [True, False])
def test_min_tokens(use_speculative: bool):
    """Check min_tokens is respected."""
    llm_common_config = dict(
        model=llama_model_path,
        max_batch_size=2,
        kv_cache_config=global_kvcache_config,
        max_num_tokens=2048,
    )

    if use_speculative:
        spec_config = NGramDecodingConfig(
            max_draft_len=4,
            max_matching_ngram_size=2,
            is_keep_all=True,
            is_use_oldest=True,
            is_public_pool=True,
        )
        llm = LLM(**llm_common_config, speculative_config=spec_config)
    else:
        llm = LLM(**llm_common_config)

    output_len = 2000
    sampling_params = SamplingParams(max_tokens=output_len,
                                     min_tokens=output_len,
                                     temperature=1)
    res = llm.generate("The end.", sampling_params=sampling_params)

    assert len(res.outputs) == 1
    assert len(res.outputs[0].token_ids) == output_len


@skip_ray
@pytest.mark.parametrize(
    "prompt_logprobs, logprobs, return_context_logits, return_generation_logits, backend",
    [
        (2, None, True, False,
         "pytorch"),  # prompt_logprobs with context_logits
        (None, 1, False, False,
         "pytorch"),  # generation logprobs only (top-1, PyTorch limit)
        (2, None, False, False,
         "pytorch"),  # prompt_logprobs without context_logits
        (None, None, False, False, "pytorch"),  # no logprobs at all
    ])
def test_llm_return_logprobs(prompt_logprobs: Optional[int],
                             logprobs: Optional[int],
                             return_context_logits: bool,
                             return_generation_logits: bool, backend: str):
    llm_return_logprobs_test_harness(prompt_logprobs,
                                     logprobs,
                                     return_context_logits,
                                     return_generation_logits,
                                     backend=backend)


@skip_ray
@pytest.mark.parametrize(
    "prompt_logprobs, logprobs, return_context_logits, return_generation_logits",
    [
        (None, 1, False,
         False),  # generation logprobs only (top-1, PyTorch limit)
        (2, None, True, False),  # prompt_logprobs with context_logits
        (2, None, False, False),  # prompt_logprobs only
        (2, 1, False, False),  # both prompt and generation logprobs
        (2, 3, False, False),  # both prompt and generation logprobs
    ])
def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
                                       return_context_logits,
                                       return_generation_logits):
    llm_return_logprobs_test_harness(prompt_logprobs,
                                     logprobs,
                                     return_context_logits,
                                     return_generation_logits,
                                     streaming=True,
                                     backend="pytorch")


class TestLlmError:

    @pytest.mark.part3
    def test_max_num_token_check(self):
        """ LLM should raise error when got prompt length exceed the valid range. """
        llm = LLM(llama_model_path,
                  kv_cache_config=global_kvcache_config,
                  max_num_tokens=100)

        with pytest.raises(ValueError,
                           match="should not exceed max_num_tokens"):
            ids = [random.randint(10, 100) for _ in range(101)]
            llm.generate([ids])


@skip_ray
@pytest.mark.parametrize("num_requests", [1, 5, 10])
def test_llm_rpc(num_requests: int):
    # TODO: remove the with-statement when shutdown hang issue is fixed
    with LLM(model=llama_model_path,
             kv_cache_config=global_kvcache_config,
             orchestrator_type="rpc") as llm:
        assert isinstance(llm._executor, GenerationExecutorRpcProxy)

        res = llm.generate("Tell me a joke",
                           sampling_params=SamplingParams(max_tokens=10,
                                                          end_id=-1))
        print(f"get result: {res}")

        assert len(res.outputs) == 1
        assert len(res.outputs[0].token_ids) == 10


@skip_ray
@pytest.mark.asyncio
async def test_llm_rpc_streaming():
    # TODO: remove the with-statement when shutdown hang issue is fixed
    with LLM(model=llama_model_path,
             kv_cache_config=global_kvcache_config,
             orchestrator_type="rpc") as llm:
        assert isinstance(llm._executor, GenerationExecutorRpcProxy)

        outputs = []
        async for output in llm.generate_async("Tell me a joke",
                                               sampling_params=SamplingParams(
                                                   max_tokens=10, end_id=-1),
                                               streaming=True):
            outputs.append(output.outputs[0].text)
        "".join(outputs)
        print(f"get result: {outputs}")


@skip_ray
def test_llm_rpc_get_stats():
    """Test that get_stats works with RPC orchestrator."""

    with LLM(model=llama_model_path,
             kv_cache_config=global_kvcache_config,
             enable_iter_perf_stats=True,
             orchestrator_type="rpc") as llm:
        assert isinstance(llm._executor, GenerationExecutorRpcProxy)

        # Generate some output to produce stats
        for output in llm.generate(
                prompts, sampling_params=SamplingParams(max_tokens=5)):
            print(output)

        stats = llm.get_stats(timeout=5)

        assert len(stats) > 0, "Should have at least one stats entry"
        # Stats should be JSON strings that can be parsed
        parsed = json.loads(stats[0]) if isinstance(stats[0], str) else stats[0]
        assert "iter" in parsed, "Stats should contain 'iter' field"
        assert "cpuMemUsage" in parsed, "Stats should contain 'cpuMemUsage' field"


@skip_ray
@pytest.mark.asyncio
async def test_llm_rpc_get_stats_async():
    """Test that get_stats_async works with RPC orchestrator."""
    import json

    with LLM(model=llama_model_path,
             kv_cache_config=global_kvcache_config,
             enable_iter_perf_stats=True,
             orchestrator_type="rpc") as llm:
        assert isinstance(llm._executor, GenerationExecutorRpcProxy)

        # Generate some output to produce stats
        async for output in llm.generate_async(
            prompts[0], sampling_params=SamplingParams(max_tokens=5)):
            print(output)

        # Get stats via async API
        stats_result = llm.get_stats_async(timeout=2)

        # Should be able to iterate over results
        stats_count = 0
        async for stat in stats_result:
            parsed = json.loads(stat) if isinstance(stat, str) else stat
            assert "iter" in parsed, "Stats should contain 'iter' field"
            stats_count += 1
            if stats_count >= 1:
                break  # Just verify we can get at least one

        assert stats_count > 0, "Should have received at least one stat"


@pytest.mark.threadleak(enabled=False)
@pytest.mark.part0
@skip_ray
def test_llm_context_only_timed_out():
    tp_size = 1
    use_overlap = False
    enable_iter_req_stats = False

    llm_args_extra = {}

    llm_args_extra.update(
        dict(enable_iter_perf_stats=True,
             enable_iter_req_stats=enable_iter_req_stats,
             disable_overlap_scheduler=not use_overlap))

    llm = LLM(model=llama_model_path,
              kv_cache_config=global_kvcache_config,
              tensor_parallel_size=tp_size,
              cache_transceiver_config=CacheTransceiverConfig(
                  backend="UCX", kv_transfer_timeout_ms=1000),
              **llm_args_extra)

    max_tokens = 1
    sampling_params = SamplingParams(max_tokens=max_tokens)

    disaggregated_params = DisaggregatedParams(request_type="context_only")

    prompts0 = [
        "What is your name?",
    ]
    prompts1 = [
        "Nvidia is awesome because",
    ]

    # Send context-only request
    for output in llm.generate(prompts1,
                               sampling_params=sampling_params,
                               disaggregated_params=disaggregated_params):
        print(output)

    max_retries = 10
    for _ in range(max_retries):
        results = llm.get_stats(2)
        if len(results) == 1:
            break
        time.sleep(1)
    else:
        pytest.fail(
            f"Failed to get stats with len==1 after {max_retries} retries")

    assert len(results) == 1
    context_only_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]
    print(f"Context only used num blocks: {context_only_used_num_blocks}")

    # Sleep 5 seconds to allow context only request to time out
    time.sleep(5)

    # Send regular request
    for output in llm.generate(prompts0, sampling_params=sampling_params):
        print(output)

    # Get number of allocated blocks
    results = llm.get_stats(2)
    assert len(results) == 1
    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]

    assert final_used_num_blocks == 0


# This test is to verify that when the KV cache is exhausted and scheduled batch size is 0, the context only request will be aborted due to timeout.


@pytest.mark.threadleak(enabled=False)
@pytest.mark.part0
@skip_ray
@pytest.mark.parametrize("sender_future_timeout_ms", [100, 1000])
@pytest.mark.parametrize("backend", ["NIXL", "UCX"])
def test_llm_context_only_timed_out_kv_cache_exhausted(sender_future_timeout_ms,
                                                       backend):
    tp_size = 1
    use_overlap = False
    enable_iter_req_stats = False

    llm_args_extra = {}

    llm_args_extra.update(
        dict(enable_iter_perf_stats=True,
             enable_iter_req_stats=enable_iter_req_stats,
             disable_overlap_scheduler=not use_overlap))

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.1,
                                    max_tokens=1000,
                                    enable_block_reuse=False)
    llm = LLM(
        model=llama_model_path,
        kv_cache_config=kv_cache_config,
        tensor_parallel_size=tp_size,
        cache_transceiver_config=CacheTransceiverConfig(
            backend=backend,
            kv_transfer_timeout_ms=1000,
            kv_transfer_sender_future_timeout_ms=sender_future_timeout_ms),
        **llm_args_extra)

    max_tokens = 1
    sampling_params = SamplingParams(max_tokens=max_tokens)

    disaggregated_params = DisaggregatedParams(request_type="context_only")

    prompts0 = [
        "What is your name?",
    ]
    prompts1 = [
        "lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua "
        * 10
    ]

    # Send context-only request
    for output in llm.generate(prompts1 * 10,
                               sampling_params=sampling_params,
                               disaggregated_params=disaggregated_params):
        print(output)

    max_retries = 10
    all_results = []
    for _ in range(max_retries):
        results = llm.get_stats(2)
        all_results.extend(results)

    assert len(all_results) > 0

    context_only_used_num_blocks = all_results[-1]["kvCacheStats"][
        "usedNumBlocks"]
    print(f"Context only used num blocks: {context_only_used_num_blocks}")

    # Sleep 5 seconds to allow context only request to time out
    time.sleep(5)

    # Send regular request
    for output in llm.generate(prompts0, sampling_params=sampling_params):
        print(output)

    # Get number of allocated blocks
    results = llm.get_stats(2)
    assert len(results) == 1
    final_used_num_blocks = results[0]["kvCacheStats"]["usedNumBlocks"]

    assert final_used_num_blocks == 0