Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
* Why? Prior to this commit, only a single multimodal input was supported for E/P/D disaggregated serving.
* What? This commit does a minor refactor of the multimodal embedding handles that cross process boundaries, to enable supporting multiple multimodal inputs. Existing unit tests are updated accordingly. `RequestOutput` has its `mm_embedding_handle` replaced in favor of `disaggregated_params`, addressing a previous TODO.

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
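A minimal sketch of the consumer-side change this implies, using only the two attribute names mentioned above; the helper function itself is hypothetical and not part of the TensorRT-LLM API:

```python
def extract_disagg_params(request_output):
    """Hypothetical helper: fetch disaggregation metadata from a finished request.

    `request_output` is assumed to be a tensorrt_llm RequestOutput as pinned by
    the reference YAML below, where `disaggregated_params` is
    Optional[DisaggregatedParams].
    """
    # Before this commit, the multimodal embedding handle was exposed directly:
    #   handle = request_output.mm_embedding_handle   # attribute now removed
    # After this commit, that information travels with the disaggregated params:
    params = request_output.disaggregated_params
    if params is None:
        raise ValueError("Request was not produced by a disaggregated (E/P/D) worker")
    return params
```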
33 lines · 843 B · YAML
methods:
  aresult:
    parameters: {}
    return_annotation: tensorrt_llm.executor.result.GenerationResult
  result:
    parameters:
      timeout:
        annotation: Optional[float]
        default: None
    return_annotation: tensorrt_llm.executor.result.GenerationResult
properties:
  request_id:
    annotation: int
    default: inspect._empty
  prompt:
    annotation: Optional[str]
    default: inspect._empty
  prompt_token_ids:
    annotation: List[int]
    default: inspect._empty
  outputs:
    annotation: List[tensorrt_llm.executor.result.CompletionOutput]
    default: inspect._empty
  context_logits:
    annotation: Optional[torch.Tensor]
    default: inspect._empty
  finished:
    annotation: bool
    default: inspect._empty
  disaggregated_params:
    annotation: Optional[DisaggregatedParams]
    default: inspect._empty
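For reference, a short usage sketch that exercises the `RequestOutput` surface pinned by this file. The model path and sampling settings are illustrative assumptions; the rest follows the TensorRT-LLM LLM API and the properties listed above:

```python
from tensorrt_llm import LLM, SamplingParams

# Illustrative checkpoint; any model supported by the LLM API would do.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
sampling_params = SamplingParams(max_tokens=32)

for output in llm.generate(["Hello, my name is"], sampling_params):
    # Properties pinned by the reference YAML above.
    print(output.request_id, output.finished)
    print(output.prompt, "->", output.outputs[0].text)
    # Optional; only populated for disaggregated (context-only / generation-only) requests.
    print(output.disaggregated_params)
```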