From 7a103035be21b0afc41f2795ca5e009b17d2bb26 Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Wed, 11 Feb 2026 17:46:25 +0100
Subject: [PATCH] [None][fix] Remove overlap scheduler adjustment for max
 sequence length in create_py_executor function (#9229)

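When the overlap scheduler is enabled, an extra token per sequence was
reserved on top of max_seq_len (in KvCacheCreator's KV cache estimation
and in create_py_executor), and the cache indirection buffers in
PyTorchModelEngine and TorchSampler grew their sequence-length dimension
by one. This change removes that +1 everywhere; only the speculative
draft-token allowance is still added for the overlap scheduler.

A sketch of the per-sequence extra-token budget, mirroring the logic in
KvCacheCreator in _util.py and assuming both the overlap scheduler and
speculative decoding are enabled (the draft-token value is hypothetical,
for illustration only):

    max_total_draft_tokens = 3  # hypothetical speculative draft budget

    # Old: 1 generated token, +1 for the overlap scheduler, plus draft
    # tokens once for the overlap scheduler and once unconditionally.
    old_extra = 1 + 1 + max_total_draft_tokens + max_total_draft_tokens  # 8

    # New: the overlap scheduler no longer adds the +1.
    new_extra = 1 + max_total_draft_tokens + max_total_draft_tokens      # 7
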
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/_util.py               | 6 ++----
 tensorrt_llm/_torch/pyexecutor/model_engine.py        | 3 +--
 tensorrt_llm/_torch/pyexecutor/py_executor_creator.py | 6 ++----
 tensorrt_llm/_torch/pyexecutor/sampler.py             | 2 +-
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 0343f746ad..d5a3e04054 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -273,10 +273,8 @@ class KvCacheCreator:
         num_cache_blocks = 0
         num_extra_tokens_per_seq = 1  # account for generated tokens
         spec_cfg = self._speculative_config
-        if not self._llm_args.disable_overlap_scheduler:
-            num_extra_tokens_per_seq = num_extra_tokens_per_seq + 1
-            if spec_cfg is not None:
-                num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens
+        if not self._llm_args.disable_overlap_scheduler and spec_cfg is not None:
+            num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens
 
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 6b2a77dcf5..5bf0c2f78b 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -457,8 +457,7 @@ class PyTorchModelEngine(ModelEngine):
         # This way it can also be used for CUDA graphs.
         if self.use_beam_search:
             self.cache_indirection_attention = torch.zeros(
-                (self.batch_size, self.max_beam_width, self.max_seq_len +
-                 (0 if self._disable_overlap_scheduler else 1)),
+                (self.batch_size, self.max_beam_width, self.max_seq_len),
                 device="cuda",
                 dtype=torch.int32)
         else:
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index 944192eeac..588ddae548 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -436,10 +436,8 @@ def create_py_executor(
     # PyTorchModelEngine modifies these fields, update them
     model_engine_max_seq_len = model_engine.max_seq_len
     net_max_seq_len = model_engine_max_seq_len
-    if not llm_args.disable_overlap_scheduler:
-        model_engine_max_seq_len = model_engine.max_seq_len + 1
-        if spec_config is not None:
-            model_engine_max_seq_len += spec_config.max_total_draft_tokens
+    if not llm_args.disable_overlap_scheduler and spec_config is not None:
+        model_engine_max_seq_len += spec_config.max_total_draft_tokens
     if spec_config is not None:
         model_engine_max_seq_len += get_num_extra_kv_tokens(spec_config)
 
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
index 27e2b99b59..30943f978e 100644
--- a/tensorrt_llm/_torch/pyexecutor/sampler.py
+++ b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -1047,7 +1047,7 @@ class TorchSampler(Sampler[SampleStateTorch], AsyncWorkerMixin):
         self.CACHE_INDIRECTION_SHAPE = (
             self.max_num_sequences,
             self.max_beam_width,
-            self.max_seq_len + (0 if args.disable_overlap_scheduler else 1),
+            self.max_seq_len,
         )
         self.LOGPROBS_SHAPE = (self.max_num_sequences, self.max_beam_width, self.max_tokens)
         self.TOPK_LOGPROBS_SHAPE = (self.max_num_sequences, self.max_tokens, self.max_topk_logprobs)