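"""Unit tests for EAGLE3 speculative decoding through the TensorRT-LLM LLM API."""
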
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock

import pytest
import torch
from utils.llm_data import llm_models_root

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm._torch.attention_backend.trtllm import TrtllmAttentionMetadata
from tensorrt_llm._torch.metadata import KVCacheParams
from tensorrt_llm.llmapi import (CudaGraphConfig, Eagle3DecodingConfig,
                                 KvCacheConfig)

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@pytest.fixture(scope="function")
def enforce_single_worker(monkeypatch):
    monkeypatch.setenv("TLLM_WORKER_USE_SINGLE_PROCESS", "1")
    yield


def test_kv_lens_runtime_with_eagle3_one_model():
    """
    Validates that kv_lens_runtime correctly excludes num_extra_kv_tokens when
    preparing attention metadata during EAGLE3 one-model speculative decoding.

    Background:
    - EAGLE3 reserves num_extra_kv_tokens = max_draft_len - 1 in the KV cache for draft token management
    - kv_lens_runtime becomes host_past_key_value_lengths, which eventually becomes mMaxSeqLenKv in the FMHA kernel
    - Bug: mMaxSeqLenKv was incorrectly set to actual_kv_length + num_extra_kv_tokens
    - Fix: mMaxSeqLenKv should be set to actual_kv_length only (without the extra tokens)

    This test validates the fix by directly exercising the prepare() logic.
    """
    # Test parameters
    num_seqs = 3
    num_extra_kv_tokens = 7  # e.g., max_draft_len = 8, so extra = 7
    prompt_lens = [50, 100, 75]  # These represent actual KV lengths
    seq_lens_q = [1, 1, 1]  # 1 token each in generation
    num_cached_tokens_per_seq = [
        prompt_lens[i] - seq_lens_q[i] for i in range(num_seqs)
    ]

    # Create a mock KV cache manager
    mock_kv_cache_manager = MagicMock()
    mock_kv_cache_manager.tokens_per_block = 32
    mock_kv_cache_manager.num_pools = 1
    mock_kv_cache_manager.max_blocks_per_seq = 16
    mock_kv_cache_manager.max_batch_size = num_seqs
    mock_kv_cache_manager.max_seq_len = 512  # Large enough to hold our test sequences
    mock_kv_cache_manager.impl.copy_batch_block_offsets = MagicMock()

    attn_metadata = TrtllmAttentionMetadata(
        max_num_requests=num_seqs,
        max_num_tokens=sum(seq_lens_q),
        kv_cache_manager=mock_kv_cache_manager,
    )

    # Set required attributes
    attn_metadata.request_ids = list(range(1, num_seqs + 1))
    attn_metadata.prompt_lens = prompt_lens
    attn_metadata._seq_lens = torch.tensor(seq_lens_q, dtype=torch.int32)
    # seq_lens_kv is the number of new KV tokens being added in this step (for generation, same as seq_lens_q)
    attn_metadata._seq_lens_kv = torch.tensor(seq_lens_q, dtype=torch.int32)

    # Set KV cache params with num_extra_kv_tokens (EAGLE3 one-model case)
    attn_metadata.kv_cache_params = KVCacheParams(
        use_cache=True,
        num_cached_tokens_per_seq=num_cached_tokens_per_seq,
        num_extra_kv_tokens=num_extra_kv_tokens)

    attn_metadata.prepare()
    actual_kv_lengths = torch.tensor(prompt_lens, dtype=torch.int32)

    # kv_lens_runtime should equal actual KV lengths (without extra tokens)
    kv_lens_runtime = attn_metadata.kv_lens_runtime[:num_seqs]
    assert torch.equal(kv_lens_runtime, actual_kv_lengths), \
        f"kv_lens_runtime should be {actual_kv_lengths.tolist()}, but got {kv_lens_runtime.tolist()}"

    # Internal kv_lens should include extra tokens
    kv_lens_internal = attn_metadata.kv_lens[:num_seqs]
    expected_kv_lens_with_extra = actual_kv_lengths + num_extra_kv_tokens
    assert torch.equal(kv_lens_internal, expected_kv_lens_with_extra), \
        f"kv_lens should be {expected_kv_lens_with_extra.tolist()}, but got {kv_lens_internal.tolist()}"

@pytest.mark.parametrize(
    "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp",
    [
        [True, "TRTLLM", True, False, False, False, True, False, False],
        [True, "TRTLLM", True, False, False, False, False, False, False],
        [False, "TRTLLM", True, False, False, False, True, False, False],
        [False, "TRTLLM", True, False, False, False, False, False, False],
        [True, "FLASHINFER", True, False, False, False, True, False, False],
        [False, "FLASHINFER", True, False, False, False, True, False, False],
        [False, "TRTLLM", False, True, True, False, True, False, False],
        [True, "TRTLLM", False, True, True, False, True, False, False],
        [True, "TRTLLM", True, False, True, True, True, False, False],
        [True, "TRTLLM", True, False, True, False, True, False, False],
        [True, "TRTLLM", True, False, False, True, True, False, False],
        [True, "TRTLLM", False, False, False, False, True, False, False],
        [False, "TRTLLM", False, False, False, False, True, False, False],
        [True, "TRTLLM", False, False, False, False, False, True, False],
        [True, "TRTLLM", False, False, False, False, False, True, True],
        [False, "TRTLLM", False, False, False, False, False, True, False],
        [True, "TRTLLM", False, False, False, False, True, True, False],
        [False, "TRTLLM", False, False, False, False, True, True, False],
        [True, "TRTLLM", False, False, False, False, False, False, False],
        [False, "TRTLLM", False, False, False, False, False, False, False],
        [True, "TRTLLM", False, False, False, True, True, False, False],
        [True, "TRTLLM", False, False, False, True, False, False, False],
        [True, "FLASHINFER", False, False, False, False, True, False, False],
        [False, "FLASHINFER", False, False, False, False, True, False, False],
    ])
@pytest.mark.high_cuda_memory
def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                      disable_overlap_scheduler: bool, enable_block_reuse: bool,
                      use_one_model: bool, enable_chunked_prefill: bool,
                      use_chain_drafter: bool, multi_batch: bool,
                      attention_dp: bool, request):
    # Eagle3 one model works with overlap scheduler and block reuse.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 35:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

    # bs > 1 gives non-deterministic results when doing IFB. There is a slight
    # chance that ref and spec do not match 100%.
    max_batch_size = 4 if multi_batch else 1
    max_draft_len = 4
    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                    max_tokens=8192)
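    # One CUDA graph is captured per batch size from 1 through max_batch_size;
    # cuda_graph_config stays None when CUDA graphs are disabled.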
    cuda_graph_config = CudaGraphConfig(
        batch_sizes=[i for i in range(1, max_batch_size +
                                      1)]) if use_cuda_graph else None

    llm_common_config = dict(
        model=target_model_dir,
        attn_backend=attn_backend,
        disable_overlap_scheduler=disable_overlap_scheduler,
        cuda_graph_config=cuda_graph_config,
        max_batch_size=max_batch_size,
        kv_cache_config=kv_cache_config,
        enable_attention_dp=attention_dp,
        # This max_seq_len is larger than the one specified
        # in the llama 3 8B eagle's config. We want to make sure
        # that the draft model won't go above its max in warmup
        # in this test.
        max_seq_len=8192,
        enable_chunked_prefill=enable_chunked_prefill,
    )
    if enable_chunked_prefill:
        # Use a small max_num_tokens so that the chunked prefill path gets exercised.
        llm_common_config['max_num_tokens'] = 64

    spec_config = Eagle3DecodingConfig(
        max_draft_len=max_draft_len,
        speculative_model_dir=eagle_model_dir,
        # Llama 3 does not support one model eagle.
        eagle3_one_model=use_one_model,
    )
    spec_config._allow_chain_drafter = use_chain_drafter

    # Create the LLM instance
    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)

    # Acceptance rate tests
    if enable_chunked_prefill:
        # Use a long prompt for chunked prefill tests.
        prompts = [
            "The capital of France is a city of romance, art, fashion, and cuisine. Paris is a must-visit destination for anyone who loves history, architecture, and culture. From the iconic Eiffel Tower to the world-famous Louvre Museum, Paris has something to offer for every interest and age.\nThe city is divided into 20 arrondissements, each with its own unique character and charm. The Latin Quarter is a popular area for students and young travelers, while the Champs-Élysées is a hub for shopping and dining. The Montmartre neighborhood is famous for its bohemian vibe and stunning views of the city.\nParis is also known for its beautiful parks and gardens, such as the Luxembourg Gardens and the Tuileries Garden. The city has a rich history, with landmarks like the Notre-Dame Cathedral and the Arc de Triomphe. Visitors can also explore the city's many museums, including the Musée d'Orsay and the Musée Rodin.\nIn addition to its cultural and historical attractions, Paris is also a great destination for foodies. The city is famous for its cuisine, including croissants, baguettes, and cheese. Visitors can sample the city's famous dishes at one of the many restaurants, cafes, and "
        ]
        tok_ids = [llm_spec.tokenizer.encode(prompts[0])]
    else:
        prompts = [
            "The capital of France is",
            "The president of the United States is",
        ]
        tok_ids = [llm_spec.tokenizer.encode("The future of AI is")]
        if multi_batch:
            tok_ids.append(llm_spec.tokenizer.encode(prompts))

    sampling_params = SamplingParams(max_tokens=128, temperature=0)
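
    # The loop below assumes each streamed step drafts max_draft_len tokens.
    # The target model always contributes one token per step, so the number of
    # newly accepted draft tokens at each step is
    # (new output length - previous length - 1).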
    for i in range(len(tok_ids)):
        num_tokens = 0
        num_drafted = 0
        num_accepted = 0

        for output in llm_spec.generate_async(tok_ids[i],
                                              sampling_params,
                                              streaming=True):
            new_tokens = output.outputs[0].token_ids
            num_drafted += max_draft_len
            num_accepted += len(new_tokens) - num_tokens - 1
            num_tokens = len(new_tokens)

        accept_rate = num_accepted / num_drafted
        assert accept_rate > 0.10

    # Output tests
    sampling_params = SamplingParams(max_tokens=10, temperature=0)

    results_spec = llm_spec.generate(prompts, sampling_params)
    generated_text_spec = [result.outputs[0].text for result in results_spec]
    llm_spec.shutdown()

    llm_ref = LLM(**llm_common_config)
    results_ref = llm_ref.generate(prompts, sampling_params)
    generated_text_ref = [result.outputs[0].text for result in results_ref]
    llm_ref.shutdown()

    for text_spec, text_ref in zip(generated_text_spec, generated_text_ref):
        # The spec decode algorithm currently guarantees identical results
        assert text_spec == text_ref

@pytest.mark.parametrize("eagle3_one_model", [True, False])
def test_eagle3_spec_decoding_stats(eagle3_one_model):
    """Test that specDecodingStats are correctly populated in the metrics endpoint."""
    models_path = llm_models_root()
    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

    # Skip if models don't exist
    if not os.path.exists(target_model_dir) or not os.path.exists(
            eagle_model_dir):
        pytest.skip("Required models not found")

    kv_cache_config = KvCacheConfig(enable_block_reuse=False,
                                    free_gpu_memory_fraction=0.6)
    spec_config = Eagle3DecodingConfig(
        max_draft_len=3,
        speculative_model_dir=eagle_model_dir,
        eagle3_one_model=eagle3_one_model,
    )

    with LLM(
            model=target_model_dir,
            speculative_config=spec_config,
            kv_cache_config=kv_cache_config,
            disable_overlap_scheduler=not eagle3_one_model,
            enable_iter_perf_stats=True,
            max_batch_size=4,
    ) as llm:
        # Generate some output to collect stats
        prompts = [
            "The capital of France is",
            "The president of the United States is",
        ]
        sampling_params = SamplingParams(max_tokens=20, temperature=0)
        llm.generate(prompts, sampling_params)

        # Get iteration stats
        stats = llm.get_stats(timeout=2)
        assert len(stats) > 0, "Should have iteration stats"
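
        # Each entry in stats is a per-iteration dict. Iterations where
        # speculation ran report a 'specDecodingStats' dict; the filter below
        # keeps only those that actually produced draft tokens.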
        # Find iterations with speculation (generation phase)
        iterations_with_spec = []
        for stat in stats:
            if 'specDecodingStats' in stat:
                spec_stats = stat['specDecodingStats']
                if spec_stats.get('numDraftTokens', 0) > 0:
                    iterations_with_spec.append(spec_stats)

        # Should have at least some iterations with spec decoding
        assert len(iterations_with_spec) > 0, \
            f"Should have iterations with specDecodingStats (found {len(iterations_with_spec)})"

        # Validate specDecodingStats structure and values
        for spec_stats in iterations_with_spec:
            # Check all fields are present
            assert 'numDraftTokens' in spec_stats
            assert 'numAcceptedTokens' in spec_stats
            assert 'numRequestsWithDraftTokens' in spec_stats
            assert 'acceptanceLength' in spec_stats
            assert 'iterLatencyMS' in spec_stats
            assert 'draftOverhead' in spec_stats

            # Validate value constraints
            assert spec_stats['numDraftTokens'] > 0
            assert 0 <= spec_stats['numAcceptedTokens'] <= spec_stats[
                'numDraftTokens']
            assert spec_stats['numRequestsWithDraftTokens'] > 0
            assert spec_stats['acceptanceLength'] >= 1.0
            assert spec_stats['iterLatencyMS'] >= 0.0
            assert 0.0 <= spec_stats['draftOverhead'] <= 1.0

        # Calculate overall acceptance rate
        total_draft = sum(s['numDraftTokens'] for s in iterations_with_spec)
        total_accepted = sum(s['numAcceptedTokens']
                             for s in iterations_with_spec)
        acceptance_rate = (total_accepted / total_draft *
                           100) if total_draft > 0 else 0

        # Should have reasonable acceptance rate for Eagle3
        assert acceptance_rate > 5.0, f"Acceptance rate too low: {acceptance_rate:.1f}%"

@pytest.mark.parametrize("use_cuda_graph", [True, False])
@pytest.mark.high_cuda_memory
def test_llama_eagle3_long_prompt(use_cuda_graph):
    # Eagle3 one model works with overlap scheduler and block reuse.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 35:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

    spec_config = Eagle3DecodingConfig(
        max_draft_len=3,
        speculative_model_dir=eagle_model_dir,
        eagle3_one_model=False,
    )

    if use_cuda_graph:
        cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
    else:
        cuda_graph_config = None

    llm_spec = LLM(model=target_model_dir,
                   speculative_config=spec_config,
                   max_batch_size=1,
                   cuda_graph_config=cuda_graph_config,
                   disable_overlap_scheduler=True)

    prompt = [", ".join(str(i) for i in range(1000))]
    sampling_params = SamplingParams(max_tokens=10, temperature=0)
    results_spec = llm_spec.generate(prompt, sampling_params)

    generated_text_spec = [result.outputs[0].text for result in results_spec]
    llm_spec.shutdown()

    llm_ref = LLM(model=target_model_dir,
                  max_batch_size=1,
                  cuda_graph_config=None,
                  disable_overlap_scheduler=False)

    results_ref = llm_ref.generate(prompt, sampling_params)

    generated_text_ref = [result.outputs[0].text for result in results_ref]
    llm_ref.shutdown()

    # The LLM with speculation on should dynamically turn it off in this
    # test since it goes beyond the max seqlen. Thus, the text should be
    # _exactly_ the same, no need to use similarity scoring.
    assert generated_text_spec[0] == generated_text_ref[0]

def test_deepseek_eagle3():
    use_cuda_graph = True
    attn_backend = "TRTLLM"
    disable_overlap_scheduler = False
    enable_block_reuse = False
    use_one_model = False
    enable_chunked_prefill = False

    # Eagle3 one model works with overlap scheduler and block reuse.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 150:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_config = {
        'architectures': ['LlamaForCausalLMEagle3'],
        'attention_bias': False,
        'attention_dropout': 0.0,
        'bos_token_id': 128000,
        'eos_token_id': [128001, 128008, 128009],
        'eagle_config': {
            'use_aux_hidden_state': False,
            'use_input_layernorm_in_first_layer': True,
            'use_last_layernorm': True,
            'use_mtp_layernorm': False
        },
        'head_dim': 128,
        'hidden_act': 'silu',
        'hidden_size': 2560,
        'initializer_range': 0.02,
        'intermediate_size': 16384,
        'max_position_embeddings': 4096,
        'mlp_bias': False,
        'model_type': 'llama',
        'num_attention_heads': 32,
        'num_eagle_features': 1,
        'num_hidden_layers': 1,
        'num_key_value_heads': 8,
        'pretraining_tp': 1,
        'rms_norm_eps': 1e-05,
        'rope_scaling': {
            'factor': 8.0,
            'high_freq_factor': 4.0,
            'low_freq_factor': 1.0,
            'original_max_position_embeddings': 8192,
            'rope_type': 'llama3'
        },
        'rope_theta': 500000.0,
        'tie_word_embeddings': False,
        'torch_dtype': 'bfloat16',
        'transformers_version': '4.52.4',
        'use_cache': True,
        'vocab_size': 129280,
        'draft_vocab_size': 129280,
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        eagle_model_dir = Path(temp_dir)
        config_path = eagle_model_dir / "config.json"
        with config_path.open("w") as f:
            json.dump(eagle_config, f, indent=2)
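        # Only a config.json is written to the temporary directory; no draft
        # checkpoint weights are needed because load_format="dummy" below
        # skips loading real weights.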
        target_model_dir = f"{models_path}/DeepSeek-V3-Lite/nvfp4_moe_only"

        # bs > 1 gives non-deterministic results when doing IFB. There is a
        # slight chance that ref and spec do not match 100%.
        max_batch_size = 16
        max_draft_len = 3
        kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                        max_tokens=8192)
        cuda_graph_config = CudaGraphConfig(
            batch_sizes=[1]) if use_cuda_graph else None

        llm_common_config = dict(
            model=target_model_dir,
            attn_backend=attn_backend,
            disable_overlap_scheduler=disable_overlap_scheduler,
            cuda_graph_config=cuda_graph_config,
            max_batch_size=max_batch_size,
            max_num_tokens=4096,
            max_seq_len=4096,
            kv_cache_config=kv_cache_config,
            enable_chunked_prefill=enable_chunked_prefill,
        )

        spec_config = Eagle3DecodingConfig(
            max_draft_len=max_draft_len,
            speculative_model_dir=eagle_model_dir,
            # Llama 3 does not support one model eagle.
            eagle3_one_model=use_one_model,
            eagle3_layers_to_capture={29},
            load_format="dummy")

        llm_spec = LLM(**llm_common_config, speculative_config=spec_config)

        tok_ids = llm_spec.tokenizer.encode("The future of AI is")

        sampling_params = SamplingParams(max_tokens=32, temperature=0)
        for output in llm_spec.generate_async(tok_ids,
                                              sampling_params,
                                              streaming=True):
            pass

def test_deepseek_mla_eagle3():
    use_cuda_graph = True
    attn_backend = "TRTLLM"
    disable_overlap_scheduler = False
    enable_block_reuse = False
    use_one_model = True
    enable_chunked_prefill = False

    # Eagle3 one model works with overlap scheduler and block reuse.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 150:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_config = {
        "architectures": ["Eagle3DeepseekV3ForCausalLM"],
        "attention_bias": False,
        "attention_dropout": 0.0,
        "first_k_dense_replace": 1,
        "hidden_act": "silu",
        "hidden_size": 2560,
        "intermediate_size": 8192,
        "kv_lora_rank": 512,
        "max_position_embeddings": 4096,
        "model_type": "kimi_k2",
        "num_attention_heads": 32,
        "num_hidden_layers": 1,
        "num_key_value_heads": 32,
        "num_nextn_predict_layers": 0,
        "q_lora_rank": 1536,
        "qk_nope_head_dim": 128,
        "qk_rope_head_dim": 64,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "beta_fast": 1.0,
            "beta_slow": 1.0,
            "factor": 64.0,
            "mscale": 1.0,
            "mscale_all_dim": 1.0,
            "original_max_position_embeddings": 4096,
            "type": "yarn"
        },
        "rope_theta": 50000.0,
        "routed_scaling_factor": 2.827,
        "scoring_func": "sigmoid",
        "seq_aux": True,
        "topk_group": 1,
        "topk_method": "noaux_tc",
        "torch_dtype": "bfloat16",
        "torchscript": False,
        "transformers_version": "4.51.3",
        "use_bfloat16": False,
        "use_cache": True,
        "v_head_dim": 128,
        "vocab_size": 129280,
        "draft_vocab_size": 129280,
        "eagle_config": {
            "use_aux_hidden_state": True,
            "use_input_layernorm_in_first_layer": True,
            "use_last_layernorm": True,
            "use_mtp_layernorm": False
        }
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        eagle_model_dir = Path(temp_dir)
        config_path = eagle_model_dir / "config.json"
        with config_path.open("w") as f:
            json.dump(eagle_config, f, indent=2)
        target_model_dir = f"{models_path}/DeepSeek-V3-Lite/nvfp4_moe_only"

        # bs > 1 gives non-deterministic results when doing IFB. There is a
        # slight chance that ref and spec do not match 100%.
        max_batch_size = 16
        max_draft_len = 3
        kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                        max_tokens=8192)
        cuda_graph_config = CudaGraphConfig(
            batch_sizes=[1]) if use_cuda_graph else None

        llm_common_config = dict(
            model=target_model_dir,
            attn_backend=attn_backend,
            disable_overlap_scheduler=disable_overlap_scheduler,
            cuda_graph_config=cuda_graph_config,
            max_batch_size=max_batch_size,
            max_num_tokens=4096,
            max_seq_len=4096,
            kv_cache_config=kv_cache_config,
            enable_chunked_prefill=enable_chunked_prefill,
            load_format="dummy",
        )

        spec_config = Eagle3DecodingConfig(
            max_draft_len=max_draft_len,
            speculative_model_dir=eagle_model_dir,
            eagle3_one_model=use_one_model,
            load_format="dummy")

        llm_spec = LLM(**llm_common_config, speculative_config=spec_config)

        tok_ids = llm_spec.tokenizer.encode("The future of AI is")

        sampling_params = SamplingParams(max_tokens=32, temperature=0)
        for output in llm_spec.generate_async(tok_ids,
                                              sampling_params,
                                              streaming=True):
            pass

@pytest.mark.parametrize("use_one_model", [True, False])
def test_multi_eagle3(use_one_model: bool):
    use_cuda_graph = True
    attn_backend = "TRTLLM"
    disable_overlap_scheduler = False
    enable_block_reuse = False
    enable_chunked_prefill = False

    # Eagle3 one model works with overlap scheduler and block reuse.
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 150:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_config = {
        'architectures': ['LlamaForCausalLMEagle3'],
        'attention_bias': False,
        'attention_dropout': 0.0,
        'bos_token_id': 128000,
        'eos_token_id': [128001, 128008, 128009],
        'eagle_config': {
            'use_aux_hidden_state': False,
            'use_input_layernorm_in_first_layer': True,
            'use_last_layernorm': True,
            'use_mtp_layernorm': False
        },
        'head_dim': 128,
        'hidden_act': 'silu',
        'hidden_size': 4096,
        'initializer_range': 0.02,
        'intermediate_size': 16384,
        'max_position_embeddings': 131072,
        'mlp_bias': False,
        'model_type': 'llama',
        'num_attention_heads': 32,
        'num_eagle_features': 1,
        'num_hidden_layers': 2,
        'num_key_value_heads': 8,
        'pretraining_tp': 1,
        'rms_norm_eps': 1e-05,
        'rope_scaling': {
            'factor': 8.0,
            'high_freq_factor': 4.0,
            'low_freq_factor': 1.0,
            'original_max_position_embeddings': 8192,
            'rope_type': 'llama3'
        },
        'rope_theta': 500000.0,
        'tie_word_embeddings': False,
        'torch_dtype': 'bfloat16',
        'transformers_version': '4.52.4',
        'use_cache': True,
        'vocab_size': 128256,
        'draft_vocab_size': 128256,
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        eagle_model_dir = Path(temp_dir)
        config_path = eagle_model_dir / "config.json"
        with config_path.open("w") as f:
            json.dump(eagle_config, f, indent=2)
        target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

        # bs > 1 gives non-deterministic results when doing IFB. There is a
        # slight chance that ref and spec do not match 100%.
        max_batch_size = 16
        max_draft_len = 3
        kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                        free_gpu_memory_fraction=0.5)
        cuda_graph_config = CudaGraphConfig(
            batch_sizes=[1]) if use_cuda_graph else None

        llm_common_config = dict(
            model=target_model_dir,
            attn_backend=attn_backend,
            disable_overlap_scheduler=disable_overlap_scheduler,
            cuda_graph_config=cuda_graph_config,
            max_batch_size=max_batch_size,
            kv_cache_config=kv_cache_config,
            enable_chunked_prefill=enable_chunked_prefill,
            load_format="dummy",
        )

        spec_config = Eagle3DecodingConfig(
            max_draft_len=max_draft_len,
            speculative_model_dir=eagle_model_dir,
            # Llama 3 does not support one model eagle.
            eagle3_one_model=use_one_model,
            num_eagle_layers=2,
            load_format="dummy")
        llm_spec = LLM(**llm_common_config, speculative_config=spec_config)

        tok_ids = llm_spec.tokenizer.encode("The future of AI is")

        sampling_params = SamplingParams(max_tokens=32, temperature=0)
        for output in llm_spec.generate_async(tok_ids,
                                              sampling_params,
                                              streaming=True):
            pass


@pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool):
    """Test CUDA graph padding with 3 requests and max_batch_size=4.

    This test verifies that when using CUDA graph with padding enabled,
    the system properly reserves one additional slot for the padded dummy request.
    Without this fix, there would be errors caused by having no free slot.
    """
    attn_backend = "TRTLLM"
    enable_block_reuse = False
    use_one_model = False
    enable_chunked_prefill = False

    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 35:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

    # Test with 3 requests and max_batch_size=4 to trigger padding
    max_batch_size = 4
    max_draft_len = 4
    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                    max_tokens=4096)
    cuda_graph_config = CudaGraphConfig(batch_sizes=[1, 2, 4],
                                        enable_padding=True)

    llm_common_config = dict(
        model=target_model_dir,
        attn_backend=attn_backend,
        disable_overlap_scheduler=disable_overlap_scheduler,
        cuda_graph_config=cuda_graph_config,
        max_batch_size=max_batch_size,
        kv_cache_config=kv_cache_config,
        max_seq_len=2048,
        enable_chunked_prefill=enable_chunked_prefill,
    )

    spec_config = Eagle3DecodingConfig(
        max_draft_len=max_draft_len,
        speculative_model_dir=eagle_model_dir,
        eagle3_one_model=use_one_model,
    )

    # Create the LLM instance
    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)

    prompts = [
        "The capital of France is", "The president of the United States is",
        "The future of AI is"
    ]
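
    # Three requests with CUDA graph batch sizes [1, 2, 4] and padding enabled:
    # the batch of 3 gets padded up to the captured size of 4, which relies on
    # the extra slot reserved for the dummy request (see the docstring above).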
    sampling_params = SamplingParams(max_tokens=2048, temperature=0)
    llm_spec.generate(prompts, sampling_params)
    llm_spec.shutdown()


@pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool):
    """Test CDL sampling with a single request and max_batch_size=1."""
    attn_backend = "TRTLLM"
    enable_block_reuse = False
    use_one_model = False
    enable_chunked_prefill = False

    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 35:
        pytest.skip("Not enough memory to load target + draft model")

    models_path = llm_models_root()
    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

    max_batch_size = 1
    max_draft_len = 4
    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                    max_tokens=8192)
    cuda_graph_config = CudaGraphConfig(batch_sizes=[1, 2, 4],
                                        enable_padding=True)

    llm_common_config = dict(
        model=target_model_dir,
        attn_backend=attn_backend,
        disable_overlap_scheduler=disable_overlap_scheduler,
        cuda_graph_config=cuda_graph_config,
        max_batch_size=max_batch_size,
        kv_cache_config=kv_cache_config,
        max_seq_len=8192,
        enable_chunked_prefill=enable_chunked_prefill,
    )

    spec_config = Eagle3DecodingConfig(
        max_draft_len=max_draft_len,
        speculative_model_dir=eagle_model_dir,
        eagle3_one_model=use_one_model,
    )

    # Create the LLM instance
    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)

    prompts = ["The president of the United States is"]

    sampling_params = SamplingParams(max_tokens=20, temperature=1.0, top_p=0.9)
    llm_spec.generate(prompts, sampling_params)
    llm_spec.shutdown()


if __name__ == "__main__":
    unittest.main()