TensorRT-LLMs/tensorrt_llm/_torch/speculative/drafter.py
Ziyi Xiong 8062e0fe7c
[TRTLLM-6392][feat] Support turning on/off spec decoding dynamically (#6363)
Signed-off-by: ziyixiong-nv <219238287+ziyixiong-nv@users.noreply.github.com>
2025-07-31 15:31:39 -04:00

29 lines
895 B
Python

from abc import ABC, abstractmethod
from typing import List, Optional
from ..pyexecutor.llm_request import LlmRequest
from ..pyexecutor.resource_manager import ResourceManager
from ..pyexecutor.scheduler import ScheduledRequests
class Drafter(ABC):
    """Abstract base class for all drafter implementations.

    A drafter produces speculative draft tokens ahead of the target model's
    forward pass and can decide per-iteration whether speculative decoding
    should be used at all.
    """

    @abstractmethod
    def prepare_draft_tokens(
        self,
        scheduled_requests: ScheduledRequests,
        resource_manager: Optional[ResourceManager] = None,
    ) -> None:
        """Prepare the draft tokens for the forward computation this step.

        Args:
            scheduled_requests: The scheduled requests for this iteration.
            resource_manager: Optional resource manager made available to the
                drafter; exact usage is defined by each subclass.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError

    def should_use_spec_decode(self, requests: List[LlmRequest]) -> bool:
        """Check if spec decode should be used for the current iteration.

        The default implementation always enables speculative decoding.
        Subclasses may override this hook to toggle speculation dynamically
        based on the current batch of requests.

        Args:
            requests: The requests active in the current iteration.

        Returns:
            True if speculative decoding should be used this iteration.
        """
        return True