diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
index 0007b99ebd..ae25544b8b 100644
--- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
+++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -304,6 +304,7 @@ class CUDAGraphRunner:
         self.static_inputs.clear()
         self.graph_outputs.clear()
         self.graph_metadata.clear()
+        self.padding_dummy_request = None
         del self.memory_pool
         self.memory_pool = None
         torch.cuda.empty_cache()
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index 3395490039..6298db0146 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -1042,6 +1042,13 @@ class SlotManager:
         slot = self.slot_mapping.pop(request_id)
         self.free_slots.add(slot)
 
+    def shutdown(self):
+        req_ids_list = list(self.slot_mapping.keys())
+        for rid in req_ids_list:
+            self.remove_slot(rid)
+        assert len(self.slot_mapping) == 0 and len(
+            self.free_slots) == self.max_num_requests
+
 
 class ResourceManager:
diff --git a/tensorrt_llm/_torch/speculative/mtp.py b/tensorrt_llm/_torch/speculative/mtp.py
index b31512df91..aa4ff35a16 100644
--- a/tensorrt_llm/_torch/speculative/mtp.py
+++ b/tensorrt_llm/_torch/speculative/mtp.py
@@ -85,7 +85,7 @@ class MTPHiddenStatesManager(BaseResourceManager):
            self.slot_manager.add_slot(rid)
 
     def shutdown(self):
-        pass
+        self.slot_manager.shutdown()
 
     def get_max_resource_count(self) -> int:
         return self.max_num_requests
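The diff adds a real teardown path: MTPHiddenStatesManager.shutdown() now delegates to the new SlotManager.shutdown(), which drains every live slot and asserts the free pool is fully restored. Below is a minimal, self-contained sketch of that contract; it is not the TensorRT-LLM classes themselves, and the constructor details and example request ids are assumptions made only for illustration.

# Simplified sketch of the shutdown contract introduced by the diff
# (assumed constructor/fields; the real class lives in resource_manager.py).
class SlotManager:

    def __init__(self, max_num_requests: int):
        self.max_num_requests = max_num_requests
        self.slot_mapping = {}  # request_id -> slot
        self.free_slots = set(range(max_num_requests))

    def add_slot(self, request_id: int) -> int:
        slot = self.free_slots.pop()
        self.slot_mapping[request_id] = slot
        return slot

    def remove_slot(self, request_id: int) -> None:
        slot = self.slot_mapping.pop(request_id)
        self.free_slots.add(slot)

    def shutdown(self) -> None:
        # Copy the keys first: remove_slot mutates slot_mapping while we iterate.
        for rid in list(self.slot_mapping.keys()):
            self.remove_slot(rid)
        # Invariant checked by the diff: nothing is mapped and every slot is free.
        assert len(self.slot_mapping) == 0
        assert len(self.free_slots) == self.max_num_requests


manager = SlotManager(max_num_requests=4)
manager.add_slot(101)  # hypothetical request ids
manager.add_slot(102)
manager.shutdown()     # releases both slots; asserts all 4 slots are free again

Copying the key list before iterating is the important detail: remove_slot() pops entries from slot_mapping, so iterating the dict directly would raise "dictionary changed size during iteration".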