diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py index 59f7256c98..380486e935 100644 --- a/tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py +++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py @@ -49,6 +49,7 @@ from tensorrt_llm.bindings.internal.batch_manager import \ from tensorrt_llm.bindings.internal.batch_manager import LlmRequest from tensorrt_llm.llmapi.llm_args import TorchLlmArgs +from .llm_request import get_draft_token_length from .scheduler import ScheduledRequests if TYPE_CHECKING: @@ -310,7 +311,8 @@ class KvCacheConnectorSchedulerOutputRequest: req.context_chunk_size) else: computed_position = len(tokens) - 1 - num_scheduled_tokens = 1 # Specdec with draft tokens is not supported yet. + num_scheduled_tokens = 1 + get_draft_token_length( + req) # Account for draft tokens scheduled via speculative decoding. return RequestData(req.request_id, new_tokens, new_block_ids, computed_position, num_scheduled_tokens) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 46960dabe7..50feb71943 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -554,9 +554,6 @@ def create_py_executor( raise NotImplementedError( "KV connector is only supported with guaranteed no evict scheduler policy." ) - elif spec_config is not None: - raise NotImplementedError( - "KV connector is not supported with speculative decoding.") try: module = importlib.import_module( kv_connector_config.connector_module)