mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
chore: rename ExecutorBindingsWorker/Proxy (#4716)
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
parent
2307e91122
commit
ac17142495
@ -12,8 +12,8 @@ __all__ = [
|
||||
"GenerationRequest",
|
||||
"LoRARequest",
|
||||
"PromptAdapterRequest",
|
||||
"ExecutorBindingsWorker",
|
||||
"ExecutorBindingsProxy",
|
||||
"GenerationExecutorWorker",
|
||||
"GenerationExecutorProxy",
|
||||
"RequestError",
|
||||
"CompletionOutput",
|
||||
"GenerationResultBase",
|
||||
|
||||
@ -35,8 +35,8 @@ from .result import GenerationResult, IterationResult
|
||||
from .utils import IntraProcessQueue, ProcessPoolExecutorSession, RequestError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .proxy import ExecutorBindingsProxy
|
||||
from .worker import ExecutorBindingsWorker
|
||||
from .proxy import GenerationExecutorProxy
|
||||
from .worker import GenerationExecutorWorker
|
||||
|
||||
__all__ = [
|
||||
"GenerationExecutor",
|
||||
@ -347,10 +347,10 @@ class GenerationExecutor(ABC):
|
||||
postproc_worker_config: Optional[PostprocWorkerConfig] = None,
|
||||
is_llm_executor: Optional[bool] = None,
|
||||
lora_config: Optional[LoraConfig] = None,
|
||||
) -> Union["ExecutorBindingsProxy", "ExecutorBindingsWorker"]:
|
||||
) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]:
|
||||
# local imports to avoid cyclic importing
|
||||
from .proxy import ExecutorBindingsProxy
|
||||
from .worker import ExecutorBindingsWorker
|
||||
from .proxy import GenerationExecutorProxy
|
||||
from .worker import GenerationExecutorWorker
|
||||
|
||||
if world_size == 0:
|
||||
world_size = mpi_world_size()
|
||||
@ -385,7 +385,7 @@ class GenerationExecutor(ABC):
|
||||
if spawn_workers or (mpirun_launch and reuse_mpi_comm):
|
||||
if reuse_mpi_comm:
|
||||
assert mpi_session is not None, "reuse_mpi_comm requires an external MPI session"
|
||||
return ExecutorBindingsProxy(
|
||||
return GenerationExecutorProxy(
|
||||
worker_kwargs,
|
||||
model_world_size=model_world_size,
|
||||
mpi_session=mpi_session,
|
||||
@ -400,15 +400,15 @@ class GenerationExecutor(ABC):
|
||||
logger.warning(
|
||||
"Using single process worker for TP1, this may hurt streaming generation performance."
|
||||
)
|
||||
return ExecutorBindingsWorker(**worker_kwargs,
|
||||
is_llm_executor=is_llm_executor)
|
||||
return GenerationExecutorWorker(**worker_kwargs,
|
||||
is_llm_executor=is_llm_executor)
|
||||
|
||||
# For single-gpu case:
|
||||
# Partition the workload across multiple processes for streaming performance.
|
||||
# Note that this requires users to protect their entrypoint with
|
||||
# `if __name__ == "__main__":`.
|
||||
if not platform.system() == 'Windows':
|
||||
return ExecutorBindingsProxy(
|
||||
return GenerationExecutorProxy(
|
||||
worker_kwargs,
|
||||
model_world_size=model_world_size,
|
||||
mpi_session=None, # use mpi4py
|
||||
@ -419,7 +419,7 @@ class GenerationExecutor(ABC):
|
||||
# The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot.
|
||||
mpi_session = ProcessPoolExecutorSession(n_workers=1,
|
||||
mp_context=ctx)
|
||||
return ExecutorBindingsProxy(
|
||||
return GenerationExecutorProxy(
|
||||
worker_kwargs,
|
||||
model_world_size=model_world_size,
|
||||
mpi_session=mpi_session,
|
||||
|
||||
@ -26,14 +26,14 @@ from .result import GenerationResult, IterationResult
|
||||
from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs,
|
||||
create_mpi_comm_session, get_spawn_proxy_process_env,
|
||||
is_llm_response)
|
||||
from .worker import ExecutorBindingsWorker, worker_main
|
||||
from .worker import GenerationExecutorWorker, worker_main
|
||||
|
||||
__all__ = [
|
||||
"ExecutorBindingsProxy",
|
||||
"GenerationExecutorProxy",
|
||||
]
|
||||
|
||||
|
||||
class ExecutorBindingsProxy(GenerationExecutor):
|
||||
class GenerationExecutorProxy(GenerationExecutor):
|
||||
READY_SIGNAL = b"READY"
|
||||
|
||||
def __init__(
|
||||
@ -42,7 +42,7 @@ class ExecutorBindingsProxy(GenerationExecutor):
|
||||
model_world_size: int = 1,
|
||||
mpi_session: Optional[MpiSession] = None,
|
||||
*,
|
||||
worker_cls: type = ExecutorBindingsWorker,
|
||||
worker_cls: type = GenerationExecutorWorker,
|
||||
postproc_worker_config: Optional[PostprocWorkerConfig] = None,
|
||||
is_llm_executor: Optional[bool] = None,
|
||||
) -> None:
|
||||
@ -297,7 +297,7 @@ class ExecutorBindingsProxy(GenerationExecutor):
|
||||
worker_cls=self.worker_cls,
|
||||
tracer_init_kwargs=tracer_init_kwargs,
|
||||
_torch_model_class_mapping=MODEL_CLASS_MAPPING,
|
||||
ready_signal=ExecutorBindingsProxy.READY_SIGNAL,
|
||||
ready_signal=GenerationExecutorProxy.READY_SIGNAL,
|
||||
BASE_ZMQ_CLASSES=serialization.BASE_ZMQ_CLASSES)
|
||||
for fut in self.mpi_futures:
|
||||
fut.add_done_callback(mpi_done_callback)
|
||||
@ -315,7 +315,7 @@ class ExecutorBindingsProxy(GenerationExecutor):
|
||||
break
|
||||
self._handle_background_error()
|
||||
|
||||
if ready_signal != ExecutorBindingsProxy.READY_SIGNAL:
|
||||
if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
|
||||
self.mpi_session.shutdown_abort(reason=ready_signal)
|
||||
raise ready_signal
|
||||
|
||||
|
||||
@ -77,7 +77,7 @@ BASE_ZMQ_CLASSES = {
|
||||
"Logprob", "LogProbsResult", "ResponseWrapper"
|
||||
],
|
||||
"tensorrt_llm.executor.utils": ["ErrorResponse", "WorkerCommIpcAddrs"],
|
||||
"tensorrt_llm.executor.worker": ["ExecutorBindingsWorker", "worker_main"],
|
||||
"tensorrt_llm.executor.worker": ["GenerationExecutorWorker", "worker_main"],
|
||||
"tensorrt_llm.llmapi.llm_args": [
|
||||
"_ModelFormatKind", "_ParallelConfig", "CalibConfig",
|
||||
"CapacitySchedulerPolicy", "KvCacheConfig", "LookaheadDecodingConfig",
|
||||
|
||||
@ -43,11 +43,11 @@ from .utils import (PERIODICAL_RESP_IN_AWAIT, ErrorResponse, IntraProcessQueue,
|
||||
is_llm_response)
|
||||
|
||||
__all__ = [
|
||||
"ExecutorBindingsWorker",
|
||||
"GenerationExecutorWorker",
|
||||
]
|
||||
|
||||
|
||||
class ExecutorBindingsWorker(GenerationExecutor):
|
||||
class GenerationExecutorWorker(GenerationExecutor):
|
||||
|
||||
class WorkerExit(GeneratorExit):
|
||||
pass
|
||||
@ -553,7 +553,7 @@ class ExecutorBindingsWorker(GenerationExecutor):
|
||||
if isinstance(self.engine, tllm.Executor):
|
||||
self.shutdown()
|
||||
raise self.WorkerExit(
|
||||
"block_subordinates() should be used in a `with ExecutorBindingsWorker() as ...:` block"
|
||||
"block_subordinates() should be used in a `with GenerationExecutorWorker() as ...:` block"
|
||||
)
|
||||
from tensorrt_llm._torch.pyexecutor.py_executor import PyExecutor
|
||||
if isinstance(self.engine, PyExecutor):
|
||||
@ -564,7 +564,7 @@ class ExecutorBindingsWorker(GenerationExecutor):
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback) -> bool:
|
||||
self.shutdown()
|
||||
return exc_type is None or exc_type == ExecutorBindingsWorker.WorkerExit
|
||||
return exc_type is None or exc_type == GenerationExecutorWorker.WorkerExit
|
||||
|
||||
def __del__(self):
|
||||
self.shutdown()
|
||||
@ -577,7 +577,7 @@ def worker_main(
|
||||
log_level: str,
|
||||
executor_config: Optional[tllm.ExecutorConfig] = None,
|
||||
batched_logits_processor: Optional[BatchedLogitsProcessor] = None,
|
||||
worker_cls: type = ExecutorBindingsWorker,
|
||||
worker_cls: type = GenerationExecutorWorker,
|
||||
tracer_init_kwargs: Optional[dict] = None,
|
||||
_torch_model_class_mapping: Optional[dict] = None,
|
||||
postproc_worker_config: Optional[PostprocWorkerConfig] = None,
|
||||
@ -707,7 +707,7 @@ def worker_main(
|
||||
"green")
|
||||
|
||||
try:
|
||||
worker: ExecutorBindingsWorker = worker_cls(
|
||||
worker: GenerationExecutorWorker = worker_cls(
|
||||
engine,
|
||||
executor_config,
|
||||
batched_logits_processor,
|
||||
@ -752,7 +752,7 @@ def worker_main(
|
||||
|
||||
notify_proxy_threads_to_quit()
|
||||
|
||||
except ExecutorBindingsWorker.WorkerExit as e:
|
||||
except GenerationExecutorWorker.WorkerExit as e:
|
||||
# This will be captured by the with-statement and exit normally.
|
||||
raise e
|
||||
|
||||
@ -774,7 +774,7 @@ class AwaitResponseHelper:
|
||||
ipc_periodically = 2
|
||||
ipc_batched = 3
|
||||
|
||||
def __init__(self, worker: "ExecutorBindingsWorker"):
|
||||
def __init__(self, worker: "GenerationExecutorWorker"):
|
||||
# TODO: make worker weakref
|
||||
self.worker = worker
|
||||
self.handler_kind: AwaitResponseHelper.HandlerKind = AwaitResponseHelper.HandlerKind.unknown
|
||||
|
||||
@ -17,7 +17,7 @@ from pydantic import BaseModel
|
||||
from utils.util import skip_single_gpu
|
||||
|
||||
from tensorrt_llm.bindings import executor as tllm
|
||||
from tensorrt_llm.executor import (ExecutorBindingsWorker, LoRARequest,
|
||||
from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest,
|
||||
PromptAdapterRequest, RequestError)
|
||||
from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, EagleDecodingConfig,
|
||||
GuidedDecodingParams, KvCacheConfig,
|
||||
@ -1593,7 +1593,7 @@ def check_llm_return_context_logits(tp_size=1):
|
||||
|
||||
# Check the WAR for returning logits performance
|
||||
if tp_size == 1:
|
||||
assert isinstance(llm._executor, ExecutorBindingsWorker)
|
||||
assert isinstance(llm._executor, GenerationExecutorWorker)
|
||||
|
||||
|
||||
def check_llm_return_generation_logits(tp_size=1):
|
||||
@ -1617,7 +1617,7 @@ def check_llm_return_generation_logits(tp_size=1):
|
||||
|
||||
# Check the WAR for returning logits performance
|
||||
if tp_size == 1:
|
||||
assert isinstance(llm._executor, ExecutorBindingsWorker)
|
||||
assert isinstance(llm._executor, GenerationExecutorWorker)
|
||||
|
||||
|
||||
def test_llm_return_context_logits():
|
||||
@ -1726,7 +1726,7 @@ def test_llm_return_logprobs_streaming():
|
||||
llm_return_logprobs_test_harness(2, 2, False, True, streaming=True)
|
||||
|
||||
|
||||
class DummyExecutorWorker3(ExecutorBindingsWorker):
|
||||
class DummyExecutorWorker3(GenerationExecutorWorker):
|
||||
should_raise_error = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
@ -8,7 +8,7 @@ from typing import Optional
|
||||
import pytest
|
||||
from parameterized import parameterized
|
||||
|
||||
from tensorrt_llm.executor import ExecutorBindingsProxy
|
||||
from tensorrt_llm.executor import GenerationExecutorProxy
|
||||
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
|
||||
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
|
||||
from tensorrt_llm.mapping import Mapping
|
||||
@ -374,7 +374,7 @@ class DummyExecutorMeta(type):
|
||||
return new_cls
|
||||
|
||||
|
||||
class DummyExecutorProxy2(ExecutorBindingsProxy):
|
||||
class DummyExecutorProxy2(GenerationExecutorProxy):
|
||||
''' This is for testing an error that occurs in a thread in the Proxy. '''
|
||||
|
||||
def __init__(
|
||||
@ -421,7 +421,7 @@ def _test_executor_handle_background_error_in_dispatch_result_thread():
|
||||
asyncio.run(task())
|
||||
|
||||
|
||||
class DummyExecutorProxy3(ExecutorBindingsProxy):
|
||||
class DummyExecutorProxy3(GenerationExecutorProxy):
|
||||
''' This is for testing an error that occurs in a Worker process in the Proxy. '''
|
||||
|
||||
def __init__(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user