diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py index 581615a201..3adee52b9e 100644 --- a/tensorrt_llm/serve/chat_utils.py +++ b/tensorrt_llm/serve/chat_utils.py @@ -34,9 +34,16 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] +class ImageEmbedsData(TypedDict): + """Type definition for serialized image embeddings structure.""" + data: Required[str] + + class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): """Type definition for image embeddings passed in base64-encoded PyTorch tensor format.""" - image_embeds: Required[str] + image_embeds: Required[ + # NB: Besides "data", could support "url" and "ipc_handle" in the future. + ImageEmbedsData] type: Required[Literal["image_embeds"]] @@ -75,7 +82,8 @@ MM_PARSER_MAP: dict[str, Callable[[ChatCompletionContentPartParam], Union[ "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), "image_embeds": - lambda part: _ImageEmbedsParser(part).get("image_embeds", None), + lambda part: _ImageEmbedsParser(part).get("image_embeds", {}).get( + "data", None), } # Map from content part tags used to directly provide embeddings diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py index 5968c7681e..dd51407e0b 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py @@ -197,7 +197,9 @@ def test_single_chat_session_image_embeds( assert image_content["type"] == "image_url" image_content.clear() image_content["type"] = "image_embeds" - image_content["image_embeds"] = b64encode(mm_embed_bytes).decode("ascii") + image_content["image_embeds"] = { + "data": b64encode(mm_embed_bytes).decode("ascii") + } # test single completion #