mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[None][perf] Autotune TRT-LLM Gen MoE when using CUDA graphs (#7285)
Signed-off-by: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com>
This commit is contained in:
parent
109f27265c
commit
572551b586
@ -825,6 +825,12 @@ class PyTorchModelEngine(ModelEngine):
    f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
)
self.enable_spec_decode = draft_len > 0 or self.is_draft_model
if self.pytorch_backend_config.enable_autotuner:
    with self.no_cuda_graph(), autotune():
        self.forward(batch,
                     new_tensors_device=None,
                     resource_manager=resource_manager)
    torch.cuda.synchronize()
self.forward(batch,
             new_tensors_device=None,
             resource_manager=resource_manager)
Loading…
Reference in New Issue
Block a user