# Adapted from
# https://github.com/vllm-project/vllm/blob/2e33fe419186c65a18da6668972d61d7bbc31564/vllm/multimodal/image.py
from typing import Any, cast

import cv2
import numpy as np
import requests
from PIL import Image
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor


def get_hf_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
) -> BaseImageProcessor:
    """Load an image processor for the given model name via HuggingFace."""
    try:
        processor = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate these
        # errors from other ValueErrors, so we cannot distinguish them here.
        if not trust_remote_code:
            err_msg = (
                "Failed to load the image processor. If the image processor "
                "is a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e

    return cast(BaseImageProcessor, processor)


def load_image(image: str) -> Image.Image:
    """Load an image from a URL or a local file path and convert it to RGB."""
    if image.startswith("http://") or image.startswith("https://"):
        pil_image = Image.open(requests.get(image, stream=True, timeout=10).raw)
    else:
        pil_image = Image.open(image)
    return pil_image.convert("RGB")


def load_video(video: str, num_frames: int = 10) -> list[Image.Image]:
    """Load up to `num_frames` uniformly spaced frames from a video file."""
    vidcap = cv2.VideoCapture(video)
    if not vidcap.isOpened():
        raise ValueError(
            f"Video '{video}' could not be opened. "
            "Make sure opencv is installed with video support.")

    # Find the last readable frame, as the reported frame count might not be
    # accurate.
    frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    while frame_count > 0:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_count - 1)
        if vidcap.grab():
            break
        frame_count -= 1
    else:
        raise ValueError(f"Video '{video}' has no frames.")

    # Extract frames at uniformly spaced indices, skipping duplicate indices
    # and frames that fail to decode.
    indices = np.round(np.linspace(0, frame_count - 1, num_frames)).astype(int)
    frames: dict[int, Image.Image] = {}
    for index in indices:
        if index in frames:
            continue
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
        success, frame = vidcap.read()
        if not success:
            continue
        # OpenCV decodes frames as BGR; convert to RGB for PIL.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames[index] = Image.fromarray(frame)

    return [frames[index] for index in indices if index in frames]
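

# Example usage (a minimal sketch for illustration; the URL, video path, and
# processor name below are placeholder assumptions, not part of this module):
if __name__ == "__main__":
    # Load a single image from a URL (or a local path) and inspect its size.
    img = load_image("https://example.com/cat.jpg")  # hypothetical URL
    print(img.size)

    # Sample up to 10 uniformly spaced frames from a local video file.
    frames = load_video("sample.mp4", num_frames=10)  # hypothetical path
    print(f"Extracted {len(frames)} frames")

    # Fetch a HuggingFace image processor and preprocess the loaded image.
    processor = get_hf_image_processor("openai/clip-vit-base-patch32")
    inputs = processor(images=img, return_tensors="np")
    print(inputs["pixel_values"].shape)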