# Adapted from
# https://github.com/vllm-project/vllm/blob/2e33fe419186c65a18da6668972d61d7bbc31564/vllm/multimodal/image.py
from typing import Any, cast

import cv2
import numpy as np
import requests
from PIL import Image
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor


def get_hf_image_processor(
    processor_name: str,
    *args: Any,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace."""
    try:
        processor = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such
        # errors.
        if not trust_remote_code:
            err_msg = (
                "Failed to load the image processor. If the image processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e

    return cast(BaseImageProcessor, processor)
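
# A minimal usage sketch (illustrative only: the checkpoint name and image
# path below are examples, not part of this module; the output keys depend
# on the processor class the checkpoint resolves to):
#
#     from PIL import Image
#     processor = get_hf_image_processor("llava-hf/llava-1.5-7b-hf")
#     inputs = processor(images=Image.open("demo.jpg"), return_tensors="pt")
#     print(inputs["pixel_values"].shape)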


def load_image(image: str) -> Image.Image:
    """Load an image from a local path or HTTP(S) URL and convert to RGB."""
    if image.startswith("http://") or image.startswith("https://"):
        image = Image.open(requests.get(image, stream=True, timeout=10).raw)
    else:
        image = Image.open(image)
    return image.convert("RGB")
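
# Example usage (illustrative URL and path, not shipped with this module):
#
#     img = load_image("https://example.com/cat.png")  # remote image
#     img = load_image("cat.png")                      # local file
#     print(img.size, img.mode)                        # mode is always "RGB"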


def load_video(video: str, num_frames: int = 10) -> list[Image.Image]:
    """Load up to ``num_frames`` uniformly sampled frames from a video file."""
    vidcap = cv2.VideoCapture(video)

    if not vidcap.isOpened():
        raise ValueError(
            f"Video '{video}' could not be opened. Make sure opencv is installed with video support."
        )

    # Find the last readable frame, as the reported frame count might not be
    # accurate.
    frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    while frame_count > 0:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_count - 1)
        if vidcap.grab():
            break
        frame_count -= 1
    else:
        raise ValueError(f"Video '{video}' has no frames.")

    # Extract frames uniformly across the video.
    indices = np.round(np.linspace(0, frame_count - 1, num_frames)).astype(int)
    frames = {}
    for index in indices:
        if index in frames:  # Duplicate index when num_frames > frame_count.
            continue
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
        success, frame = vidcap.read()
        if not success:
            continue
        # OpenCV decodes to BGR; convert to RGB for PIL.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames[index] = Image.fromarray(frame)
    return [frames[index] for index in indices if index in frames]
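
# Example usage (illustrative path; requires an OpenCV build with video codec
# support; fewer than num_frames frames may be returned for short videos):
#
#     frames = load_video("clip.mp4", num_frames=8)
#     print(len(frames), frames[0].size)  # up to 8 PIL RGB frames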