From 03b38e9fbf557113d69926a64d70a3c9ddcf271e Mon Sep 17 00:00:00 2001
From: mpikulski <206748156+ixlmar@users.noreply.github.com>
Date: Sat, 7 Feb 2026 05:31:11 +0100
Subject: [PATCH] [TRTLLM-10030][perf] avoid sync in PyTorchModelEngine when
 using beam search (#11341)

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/model_engine.py | 3 ++-
 tensorrt_llm/_torch/pyexecutor/sampler.py      | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 37b6fa1e99..e6ff77c993 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -2714,7 +2714,8 @@ class PyTorchModelEngine(ModelEngine):
             #Copy cache indirection to local buffer with offsets changing: seq_slots[i] -> i
             # Convert to GPU tensor to avoid implicit sync
             gen_request_seq_slots_tensor = torch.tensor(
-                gen_request_seq_slots, dtype=torch.long, device='cuda')
+                gen_request_seq_slots, dtype=torch.long,
+                pin_memory=True).to(device='cuda', non_blocking=True)
             self.cache_indirection_attention[:num_generation_requests].copy_(
                 cache_indirection_buffer[gen_request_seq_slots_tensor])
         if cache_indirection_buffer is not None or is_cuda_graph_during_warmup:
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
index ed9aae6ccb..31e56ccb05 100644
--- a/tensorrt_llm/_torch/pyexecutor/sampler.py
+++ b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -898,6 +898,8 @@ class AsyncWorkerMixin:
 class TorchSampler(Sampler[SampleStateTorch], AsyncWorkerMixin):
     DEFAULT_MAX_TOPK_LOGPROBS = 20
 
+    SampleState = SampleStateTorch
+
     @override
     def get_cache_indirection(self) -> torch.Tensor | None:
         return self.store.cache_indirection