From 03b38e9fbf557113d69926a64d70a3c9ddcf271e Mon Sep 17 00:00:00 2001
From: mpikulski <206748156+ixlmar@users.noreply.github.com>
Date: Sat, 7 Feb 2026 05:31:11 +0100
Subject: [PATCH] [TRTLLM-10030][perf] avoid sync in PyTorchModelEngine when
 using beam search (#11341)

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/model_engine.py | 3 ++-
 tensorrt_llm/_torch/pyexecutor/sampler.py      | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 37b6fa1e99..e6ff77c993 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -2714,7 +2714,8 @@ class PyTorchModelEngine(ModelEngine):
             #Copy cache indirection to local buffer with offsets changing: seq_slots[i] -> i
             # Convert to GPU tensor to avoid implicit sync
             gen_request_seq_slots_tensor = torch.tensor(
-                gen_request_seq_slots, dtype=torch.long, device='cuda')
+                gen_request_seq_slots, dtype=torch.long,
+                pin_memory=True).to(device='cuda', non_blocking=True)
             self.cache_indirection_attention[:num_generation_requests].copy_(
                 cache_indirection_buffer[gen_request_seq_slots_tensor])
         if cache_indirection_buffer is not None or is_cuda_graph_during_warmup:
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
index ed9aae6ccb..31e56ccb05 100644
--- a/tensorrt_llm/_torch/pyexecutor/sampler.py
+++ b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -898,6 +898,8 @@ class AsyncWorkerMixin:
 class TorchSampler(Sampler[SampleStateTorch], AsyncWorkerMixin):
     DEFAULT_MAX_TOPK_LOGPROBS = 20
 
+    SampleState = SampleStateTorch
+
     @override
     def get_cache_indirection(self) -> torch.Tensor | None:
         return self.store.cache_indirection