# Configuration for multimodal (vision + text) models.
# NOTE(review): `model_factory` presumably names the model class the loader
# instantiates — confirm against the code that consumes this file.
model_factory: AutoModelForImageTextToText