Namo-R1/namo/processor/image_processing_namo.py
2025-02-22 12:25:49 +08:00

100 lines
3.5 KiB
Python

from typing import Dict, List, Optional, Union
import numpy as np
from transformers.image_utils import ImageInput, is_valid_image
from transformers.image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
)
from transformers.utils import is_vision_available, logging
from transformers import CLIPImageProcessor
from transformers.image_transforms import resize
# Module-level logger created through the transformers logging helper.
logger = logging.get_logger(__name__)

# PIL is imported only when transformers reports that vision support is
# available, so importing this module does not itself fail on a missing PIL.
if is_vision_available():
    import PIL
class NamoImageProcessor(CLIPImageProcessor):
    """CLIP image processor whose resize keeps images within an edge range.

    Instead of always resizing to a fixed shortest edge, ``resize`` only
    rescales an image when its shortest edge is below ``size["shortest_edge"]``
    or its longest edge is above ``size["longest_edge"]``. The resulting height
    and width are always snapped down to a multiple of 28.
    """

    model_input_names = ["pixel_values"]

    @staticmethod
    def _compute_target_size(
        height: int,
        width: int,
        shortest_edge: int,
        longest_edge: int,
        divider: int,
    ) -> tuple:
        """Return the ``(height, width)`` an image should be resized to.

        Args:
            height: Original image height in pixels.
            width: Original image width in pixels.
            shortest_edge: Minimum allowed length of the shorter side.
            longest_edge: Maximum allowed length of the longer side.
            divider: Both returned dimensions are multiples of this value.

        Returns:
            A ``(new_height, new_width)`` tuple; each entry is a multiple of
            ``divider`` and at least ``divider``.
        """
        current_shortest = min(height, width)
        current_longest = max(height, width)

        if current_longest > longest_edge:
            # Too large: shrink so the longest edge fits ...
            scale = longest_edge / current_longest
            if current_shortest * scale < shortest_edge:
                # ... unless that would make the shortest edge too small, in
                # which case the shortest edge takes priority.
                scale = shortest_edge / current_shortest
        elif current_shortest < shortest_edge:
            # Too small: enlarge so the shortest edge reaches the minimum.
            scale = shortest_edge / current_shortest
        else:
            # Already within range: keep the size (it is still snapped below).
            scale = 1.0

        new_height = int(round(height * scale))
        new_width = int(round(width * scale))

        # For extreme aspect ratios, satisfying the shortest edge can push the
        # longest edge past the limit. Clamp it; this distorts the aspect
        # ratio but keeps the output size bounded.
        if max(new_height, new_width) > longest_edge:
            if new_width > new_height:
                new_width = longest_edge
            else:
                new_height = longest_edge

        # Snap down to a multiple of `divider` (28 = 14 * 2), but never to 0:
        # a side shorter than `divider` would otherwise floor to an empty image.
        new_height = max(divider, (new_height // divider) * divider)
        new_width = max(divider, (new_width // divider) * divider)
        return new_height, new_width

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """Resize ``image`` so its edges fall inside the configured range.

        Overrides the CLIP resize logic: an image that is too large is scaled
        by its longest edge, an image that is too small is scaled by its
        shortest edge, and an image already within range keeps its scale. In
        every case the output height and width are multiples of 28.

        Args:
            image: Image array to resize.
            size: Size configuration. ``"shortest_edge"`` defaults to 28 and
                ``"longest_edge"`` defaults to 714 when absent.
            resample: Resampling filter forwarded to the resize transform.
            data_format: Channel layout of the returned image, forwarded to
                the resize transform.
            input_data_format: Channel layout of ``image``. It is used both to
                read the image height/width and by the resize transform.
            **kwargs: Extra keyword arguments forwarded to the resize
                transform.

        Returns:
            The resized image.
        """
        # Imported here so this method does not depend on an edit to the
        # module-level import block.
        from transformers.image_utils import get_image_size

        minimal_divider = 28
        config_shortest = size.get("shortest_edge", minimal_divider)
        config_longest = size.get("longest_edge", 714)

        # Respect the declared channel layout; `image.shape[:2]` would return
        # (channels, height) for a channels-first array.
        orig_height, orig_width = get_image_size(
            image, channel_dim=input_data_format
        )

        new_height, new_width = self._compute_target_size(
            orig_height,
            orig_width,
            config_shortest,
            config_longest,
            minimal_divider,
        )

        return resize(
            image,
            size=(new_height, new_width),
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
# Names exported when this module is star-imported.
__all__ = ["NamoImageProcessor"]