fix: add warmup flag to py_executor to prevent enabling the profiler during warmup (#3852)

* Add a warmup flag to py_executor to prevent enabling the profiler during warmup
  Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
* Fix a pre-commit bug
  Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
* Change the warmup flag to be set on all ranks
  Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---------
Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
2026-01-14 06:27:45 +08:00 · 2025-04-27 19:22:42 +08:00 · 2025-04-27 19:22:42 +08:00 · 76f2c631fb
commit 76f2c631fb
parent e2318756ed
2 changed files with 5 additions and 2 deletions
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -206,6 +206,7 @@ def estimate_max_kv_cache_tokens(py_executor: PyExecutor,
        req = create_dummy_context_requests(max_num_tokens, seq_len, vocab_size)
        req_ids = py_executor.enqueue_requests(req)
    req_ids = mpi_broadcast(req_ids, root=0)
+    py_executor.is_warmup = True
    py_executor.start_worker()
    py_executor.await_responses(req_ids)
    # TODO check why call mpi_barrier() here will hang-on, but call mpi_allgather(0) is fine.
@@ -250,6 +251,7 @@ def estimate_max_kv_cache_tokens(py_executor: PyExecutor,
    py_executor.resource_manager.resource_managers.get(
        "kv_cache_manager").shutdown()

+    py_executor.is_warmup = False
    if py_executor.dist.mapping.rank == 0:
        py_executor.shutdown()

--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -174,6 +174,7 @@ class PyExecutor:
        self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes(
            PROFILE_START_STOP_ENV_VAR_NAME)
        self.gc_nvtx_watcher_handle = _gc_nvtx_watcher()
+        self.is_warmup = False  # During warmup, we don't enable the profiler

        # related modules
        self.resource_manager = resource_manager
@@ -444,7 +445,7 @@ class PyExecutor:

        def profile_step():
            nonlocal it, enabled, start_time
-            if it in self.profile_stop_iters:
+            if it in self.profile_stop_iters and not self.is_warmup:
                assert enabled, "Inconsistent CUDA profiling state"
                if enable_torch_trace:
                    torch_profiler.stop()
@@ -470,7 +471,7 @@ class PyExecutor:

            it += 1

-            if it in self.profile_start_iters:
+            if it in self.profile_start_iters and not self.is_warmup:
                assert not enabled, "Inconsistent CUDA profiling state"
                torch.cuda.cudart().cudaProfilerStart()
                if enable_torch_trace: