# dify/api/core/plugin/utils/chunk_merger.py

from collections.abc import Generator
from dataclasses import dataclass, field
from typing import TypeVar, Union, cast

from core.agent.entities import AgentInvokeMessage
from core.tools.entities.tool_entities import ToolInvokeMessage

# Type variable constrained (via ``bound``) to ToolInvokeMessage or
# AgentInvokeMessage, so a function annotated with it on both its input and
# output is typed as returning the same message class it was given.
MessageType = TypeVar("MessageType", bound=Union[ToolInvokeMessage, AgentInvokeMessage])


@dataclass
class FileChunk:
    """
    Reassembly buffer for a single file received as streamed chunks.

    Only ``total_length`` is accepted by the constructor. Right after
    construction a zero-filled ``bytearray`` of that many bytes is allocated
    as ``data`` and ``bytes_written`` is 0.
    """

    # Number of bytes used to pre-allocate ``data``.
    total_length: int
    # Write offset into ``data``; starts at 0 and is advanced by the code that
    # fills the buffer.
    bytes_written: int = field(init=False, default=0)
    # Backing storage; created in __post_init__ because its size depends on
    # ``total_length``.
    data: bytearray = field(init=False)

    def __post_init__(self) -> None:
        buffer_size = self.total_length
        self.data = bytearray(buffer_size)


def merge_blob_chunks(
    response: Generator[MessageType, None, None],
    max_file_size: int = 30 * 1024 * 1024,
    max_chunk_size: int = 8192,
) -> Generator[MessageType, None, None]:
    """
    Merge streaming blob chunks into complete blob messages.

    This function processes a stream of plugin invoke messages, accumulating
    BLOB_CHUNK messages by their ID until the final chunk is received,
    then yielding a single complete BLOB message. Every other message is
    passed through unchanged, in order. Chunks of different IDs may be
    interleaved; each ID has its own buffer. A file whose final (``end``)
    chunk never arrives is never yielded.

    The ``total_length`` declared by the first chunk of a file is only used as
    a pre-allocation hint. The pre-allocated size is capped at
    ``max_file_size`` so that a wrong or hostile declaration cannot force a
    larger allocation than the size limit allows; the buffer still grows on
    demand if more data than pre-allocated is received.

    Args:
        response: Generator yielding messages that may include blob chunks
        max_file_size: Maximum allowed file size in bytes (default: 30MB)
        max_chunk_size: Maximum allowed chunk size in bytes (default: 8KB)

    Yields:
        Messages from the response stream, with blob chunks merged into complete blobs

    Raises:
        ValueError: If the accumulated file size exceeds max_file_size or a
            single chunk exceeds max_chunk_size
    """
    files: dict[str, FileChunk] = {}

    for resp in response:
        # Anything that is not a blob chunk is forwarded untouched.
        if resp.type != ToolInvokeMessage.MessageType.BLOB_CHUNK:
            yield resp
            continue

        assert isinstance(resp.message, ToolInvokeMessage.BlobChunkMessage)
        chunk_id = resp.message.id
        blob_data = resp.message.blob
        chunk_size = len(blob_data)

        # Look up (or create) the buffer for this file once per chunk.
        buffer = files.get(chunk_id)
        if buffer is None:
            # total_length comes from the sender and has not been validated:
            # never pre-allocate more than the configured size limit.
            buffer = FileChunk(min(resp.message.total_length, max_file_size))
            files[chunk_id] = buffer

        # Check if file is too large (before appending)
        if buffer.bytes_written + chunk_size > max_file_size:
            # Drop the partial file so its buffer can be released
            del files[chunk_id]
            raise ValueError(f"File is too large which reached the limit of {max_file_size / 1024 / 1024}MB")

        # Check if single chunk is too large
        if chunk_size > max_chunk_size:
            # Drop the partial file here as well, mirroring the size-limit branch
            del files[chunk_id]
            raise ValueError(f"File chunk is too large which reached the limit of {max_chunk_size / 1024}KB")

        # Copy the chunk in at the current write offset. Slice assignment past
        # the end of a bytearray extends it, so this also works when more data
        # arrives than was pre-allocated.
        write_end = buffer.bytes_written + chunk_size
        buffer.data[buffer.bytes_written : write_end] = blob_data
        buffer.bytes_written = write_end

        # If this is the final chunk, yield a complete blob message
        if resp.message.end:
            # Build the merged message with the same class as the incoming one
            message_class = type(resp)
            merged_message = message_class(
                type=ToolInvokeMessage.MessageType.BLOB,
                # Trim unused pre-allocated bytes: only what was actually written
                message=ToolInvokeMessage.BlobMessage(blob=buffer.data[: buffer.bytes_written]),
                meta=resp.meta,
            )
            yield cast(MessageType, merged_message)
            # Clean up the buffer
            del files[chunk_id]