[None][perf] Autotune TRT-LLM Gen MoE when using CUDA graphs (#7285)

Signed-off-by: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com>
Jinyang Yuan 2025-09-03 10:08:59 +08:00 committed by GitHub
parent 109f27265c
commit 572551b586


@@ -825,6 +825,12 @@ class PyTorchModelEngine(ModelEngine):
                     f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
                 )
                 self.enable_spec_decode = draft_len > 0 or self.is_draft_model
+                if self.pytorch_backend_config.enable_autotuner:
+                    with self.no_cuda_graph(), autotune():
+                        self.forward(batch,
+                                     new_tensors_device=None,
+                                     resource_manager=resource_manager)
+                    torch.cuda.synchronize()
                 self.forward(batch,
                              new_tensors_device=None,
                              resource_manager=resource_manager)
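
For context, the pattern this commit applies is: autotuning has to run eagerly, outside CUDA graph capture, so the warmup first executes a forward pass under an autotuning context with CUDA graphs disabled, and only afterwards runs the forward that gets captured. Below is a minimal, self-contained sketch of that pattern; the `autotune` stub and the `warmup_and_capture` helper are hypothetical illustrations, not TensorRT-LLM's actual autotuner API.

# Minimal sketch (hypothetical names, not the TRT-LLM implementation):
# kernel tactics must be selected during an eager pass, because no new
# tuning can happen once the forward is frozen into a CUDA graph.
import contextlib

import torch


@contextlib.contextmanager
def autotune():
    # Hypothetical stand-in for an autotuner context: a real one would
    # profile candidate kernels during forwards run inside this block.
    yield


def warmup_and_capture(model, static_input):
    # 1. Eager autotuning pass, explicitly outside any graph capture
    #    (mirrors the `no_cuda_graph()` + `autotune()` block in the diff).
    with autotune():
        model(static_input)
    torch.cuda.synchronize()

    # 2. Plain eager warmup pass (the pre-existing warmup forward).
    model(static_input)
    torch.cuda.synchronize()

    # 3. Capture the now-tuned forward into a CUDA graph for cheap replay.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output = model(static_input)
    return graph, static_output


if __name__ == "__main__" and torch.cuda.is_available():
    net = torch.nn.Linear(16, 16).cuda()
    x = torch.randn(8, 16, device="cuda")
    g, out = warmup_and_capture(net, x)
    g.replay()  # re-runs the captured forward with the tuned kernels

Replaying the graph reuses whatever tactics were picked during the eager autotuning pass, which is why the commit runs that pass before, not during, the CUDA graph warmup forward.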