mtmd: refactor llava-uhd overview image handling (always use ov_img_first) (#24769)

* add dedicated "overview" for mtmd_image_preproc_out * corrections * correct (again) * nits * nits (2)
2026-06-26 06:10:19 +00:00 · 2026-06-18 18:53:49 +02:00
parent d2c67959b3
commit 060ce1bf72
3 changed files with 113 additions and 88 deletions
@@ -26,6 +26,13 @@ void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32
    entries.push_back(std::move(img));
 }

+void mtmd_image_preproc_out::append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) { // sets the overview image from a u8 image
+    overview.from_u8(img); // writes into the single `overview` member; nothing is pushed to `entries`
+    if (normalized) {
+        overview.normalize(hparams.image_mean, hparams.image_std); // uses the mean/std carried by hparams
+    }
+}
+
 // set of tools to manipulate images
 // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
 struct img_tool {
@@ -607,10 +614,11 @@ private:
 mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) {
    const clip_image_size original_size = img.get_size();
    auto const inst = get_slice_instructions(original_size);
-    std::vector<clip_image_u8> imgs = slice_image(img, inst);
+    auto sliced = slice_image(img, inst);

    mtmd_image_preproc_out output;
-    output.append(hparams, imgs, true);
+    output.append_overview(hparams, sliced.overview, true);
+    output.append(hparams, sliced.slices, true);
    output.grid_x = inst.grid_size.width;
    output.grid_y = inst.grid_size.height;

@@ -722,22 +730,15 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll
    return res;
 }

-std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
-    std::vector<clip_image_u8> output;
+mtmd_image_preprocessor_llava_uhd::slice_output mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst) {
+    slice_output output;

    // resize to overview size
-    clip_image_u8 resized_img;
-    img_tool::resize(img, resized_img, inst.overview_size, hparams.image_resize_algo_ov,
+    img_tool::resize(img, output.overview, inst.overview_size, hparams.image_resize_algo_ov,
                        hparams.image_pad_ov, hparams.image_pad_color_ov);
-    if (overview_first) {
-        output.push_back(resized_img);
-    }

    if (inst.slices.empty()) {
-        // no slices, just return the resized image
-        if (!overview_first) {
-            output.push_back(resized_img);
-        }
+        // no slices, just return the overview image
        return output;
    }

@@ -755,11 +756,7 @@ std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const

        clip_image_u8 img_slice;
        img_tool::crop(refined_img, img_slice, x, y, w, h);
-        output.push_back(std::move(img_slice));
-    }
-
-    if (!overview_first) {
-        output.push_back(resized_img);
+        output.slices.push_back(std::move(img_slice));
    }

    return output;
@@ -1077,10 +1074,11 @@ mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_i
            });
        }
    }
-    auto imgs = slice_image(img, instructions);
+    auto sliced = slice_image(img, instructions);

    mtmd_image_preproc_out output;
-    output.append(hparams, imgs, true);
+    output.append_overview(hparams, sliced.overview, true);
+    output.append(hparams, sliced.slices, true);
    output.grid_x = instructions.grid_size.width;
    output.grid_y = instructions.grid_size.height;
    return output;
@@ -1094,10 +1092,12 @@ mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_i
    GGML_ASSERT(!hparams.image_res_candidates.empty());
    const clip_image_size original_size = img.get_size();
    auto const inst = get_slice_instructions(original_size);
-    std::vector<clip_image_u8> imgs = slice_image(img, inst, false);
+    auto sliced = slice_image(img, inst);

    mtmd_image_preproc_out output;
-    output.append(hparams, imgs, true);
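+    // InternVL: overview goes after the slices; that ordering comes from ov_img_first = false on the mtmd_context, not from the call order below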
+    output.append(hparams, sliced.slices, true);
+    output.append_overview(hparams, sliced.overview, true);
    output.grid_x = inst.grid_size.width;
    output.grid_y = inst.grid_size.height;
    return output;
@@ -1131,9 +1131,10 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const cli
    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
                     PAD_NEAREST, hparams.image_pad_color);
    mtmd_image_preproc_out output;
-    output.append(hparams, padded, true);
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.append_overview(hparams, padded, true);
+    output.grid_x = 0;
+    output.grid_y = 0;
+    // TODO @ngxson : support slicing for DeepSeek-OCR, to do in another PR
    return output;
 }

@@ -1226,10 +1227,8 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const cl
    clip_image_u8 padded;
    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
                     PAD_NEAREST, hparams.image_pad_color);
-    output.append(hparams, padded, true);
-    output.entries.back().add_viewsep = true;
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.append_overview(hparams, padded, true);
+    output.overview.add_viewsep = true;
    return output;
 }

@@ -1447,15 +1446,14 @@ mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_im
    const auto instructions = build_slice_instructions(hparams, prepared.get_size());

    mtmd_image_preproc_out output;
-    clip_image_f32 overview_f32;
+    // overview (normalized f32, already includes mean/std)
    img_u8_resize_bilinear_to_f32(
        prepared,
-        overview_f32,
+        output.overview,
        hparams.image_size,
        hparams.image_size,
        hparams.image_mean,
        hparams.image_std);
-    output.append(hparams, overview_f32, false);

    if (instructions.slices.empty()) {
        output.grid_x = 0;
@@ -1548,13 +1546,13 @@ mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_im

 mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) {
    auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img);
-    if (output.entries.size() == 1) {
+    if (output.entries.size() == 0) {
        // Single-tile (overview only): append one newline row.
-        output.entries[0].add_newline = true;
+        output.overview.add_newline = true;
    } else {
        // Multi-tile: overview gets no newline, grid tiles get one.
-        output.entries[0].add_newline = false;
-        for (size_t i = 1; i < output.entries.size(); ++i) {
+        output.overview.add_newline = false;
+        for (size_t i = 0; i < output.entries.size(); ++i) {
            output.entries[i].add_newline = true;
        }
    }
@@ -11,11 +11,19 @@
 struct mtmd_image_preproc_out {
    std::vector<clip_image_f32> entries;
    // grid size is required for llava-uhd style models
+
+    clip_image_f32 overview; // overview image (downscaled image); stored separately from entries
    int grid_x = 0;
    int grid_y = 0;
+
    void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
    void append(const clip_hparams & hparams, const std::vector<clip_image_u8> & imgs, bool normalized = true);
    void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true);
+
+    void append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true); // sets `overview` from img; does not touch entries
+    bool has_overview() const { // true when overview reports a non-zero nx() or ny()
+        return overview.nx() > 0 || overview.ny() > 0; // NOTE(review): split_batch_to_chunk rejects an overview if either dimension is 0 - confirm `||` (not `&&`) is intended
+    }
 };

 // base class, models must inherit from this class
@@ -46,6 +54,8 @@ struct mtmd_image_preprocessor {
 * [overview] --> [slice 1] --> [slice 2]
 *           |                |
 *           +--> [slice 3] --> [slice 4]
+ *
+ * NOTE: for the ordering of overview, set "ov_img_first" on the mtmd_context
 */
 struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
    mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
@@ -67,7 +77,11 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
    // LFM2 override this function to implement its custom slicing logic
    virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);

-    std::vector<clip_image_u8> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
+    struct slice_output { // result of slice_image(): the overview is kept apart from the slices
+        clip_image_u8 overview; // input image resized to inst.overview_size
+        std::vector<clip_image_u8> slices; // cropped tiles; left empty when inst.slices is empty
+    };
+    slice_output slice_image(const clip_image_u8 & img, const slice_instructions & inst);

 private:
    clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
@@ -516,6 +516,7 @@ struct mtmd_context {
                    LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
                            "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                    ov_img_first = false;
                } break;
            case PROJECTOR_TYPE_STEP3VL:
                {
@@ -539,6 +540,7 @@ struct mtmd_context {
                    img_beg = "<img>";
                    img_end = "</img>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
+                    ov_img_first = false;
                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
@@ -615,11 +617,13 @@ struct mtmd_context {
                {
                    img_end = "\n"; // prevent empty batch on llama-server
                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
+                    ov_img_first = false;
                } break;
            case PROJECTOR_TYPE_DEEPSEEKOCR2:
                {
                    img_end = "\n"; // prevent empty batch on llama-server
                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                    ov_img_first = false;
                } break;
            case PROJECTOR_TYPE_HUNYUANVL:
                {
@@ -640,6 +644,7 @@ struct mtmd_context {
                    img_beg = "<image>";
                    img_end = "";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
+                    ov_img_first = true;
                } break;
            default:
                throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1079,26 +1084,38 @@ struct mtmd_tokenizer {

                // for llava-uhd style, we need to handle grid too
                // we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd
-                if (tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) {
+                if ((tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0)
+                        || tmp_preproc_out.has_overview()) {
                    GGML_ASSERT(bitmaps.size() == 1);
                    preproc_out.grid_x = tmp_preproc_out.grid_x;
                    preproc_out.grid_y = tmp_preproc_out.grid_y;
+                    preproc_out.overview = std::move(tmp_preproc_out.overview);
                }
            }

+            LOG_DBG("%s: preproc_out has %zu entries, grid_x = %d, grid_y = %d, has_overview = %d\n",
+                    __func__, preproc_out.entries.size(), preproc_out.grid_x, preproc_out.grid_y,
+                    preproc_out.has_overview() ? 1 : 0);
+
            // handle llava-uhd style preprocessing
-            const bool has_tiling_grid = preproc_out.grid_x > 0 && preproc_out.grid_y > 0;
+            // (output either a grid, or overview-only)
+            const bool has_tiling_grid = (preproc_out.grid_x > 0 && preproc_out.grid_y > 0)
+                || preproc_out.has_overview();
+
            if (has_tiling_grid) {
                // [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
                GGML_ASSERT(bitmaps.size() == 1);

                const int n_col = preproc_out.grid_x;
                const int n_row = preproc_out.grid_y;
+
                // split batch into chunks of single images
-                // NOTE: preproc_out will be invalidated after this call
                auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id);
                GGML_ASSERT(chunks.size() > 0);

+                // NOTE: preproc_out is invalidated after this point, do not use it anymore
+
+                // split_batch_to_chunk must always put the overview image first
                auto ov_chunk = std::move(chunks.front());
                chunks.erase(chunks.begin());

@@ -1125,7 +1142,16 @@ struct mtmd_tokenizer {
                                std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
                                add_text(std::string(buf.get(), buf.get() + sz - 1), true);
                            }
-                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
+
+                            auto & curr_chunk = chunks[y * n_col + x];
+                            auto & curr_batch = curr_chunk.tokens_image->batch_f32;
+                            if (curr_batch.entries.size() != 1) {
+                                throw std::runtime_error(string_format("%s: expect 1 image in batch_f32", __func__));
+                            }
+
+                            LOG_DBG("%s: adding slice image at row %d col %d\n", __func__, y, x);
+                            cur.entries.emplace_back(std::move(curr_chunk));
+
                            add_text(ctx->tok_sli_img_end);
                            if (!is_last_in_row) {
                                add_text(ctx->tok_sli_img_mid);
@@ -1147,6 +1173,11 @@ struct mtmd_tokenizer {

            } else {

+                if (preproc_out.entries.size() == 0) {
+                    LOG_ERR("%s: no image tokens produced by preprocessor (ref: https://github.com/ggml-org/llama.cpp/pull/24769)\n", __func__);
+                    return 2;
+                }
+
                size_t n_tokens = 0;
                for (auto & e : preproc_out.entries) {
                    n_tokens += clip_n_output_tokens(ctx->ctx_v, &e);
@@ -1303,13 +1334,15 @@ struct mtmd_tokenizer {
    std::vector<mtmd_input_chunk> split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) {
        std::vector<mtmd_input_chunk> chunks;

-        for (auto & entry : preproc_out.entries) {
+        auto process_chunk = [&](clip_image_f32 && img) {
            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &entry);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &img);
            image_tokens->ny = 1;
-            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->batch_f32.entries.push_back(std::move(img));
            image_tokens->id = id;

+            GGML_ASSERT(image_tokens->nx > 0);
+
            mtmd_input_chunk chunk{
                MTMD_INPUT_CHUNK_TYPE_IMAGE,
                {}, // text tokens
@@ -1317,6 +1350,21 @@ struct mtmd_tokenizer {
                nullptr, // audio tokens
            };
            chunks.emplace_back(std::move(chunk));
+        };
+
+        // overview image first
+        auto & overview = preproc_out.overview;
+        if (overview.nx() == 0 || overview.ny() == 0) {
+            throw std::runtime_error(string_format("%s: invalid overview image for llava-uhd style preprocessing\n", __func__));
+        }
+        process_chunk(std::move(preproc_out.overview));
+
+        // then, process slices
+        for (auto & entry : preproc_out.entries) {
+            if (entry.nx() == 0 || entry.ny() == 0) {
+                throw std::runtime_error(string_format("%s: invalid image slice for llava-uhd style preprocessing\n", __func__));
+            }
+            process_chunk(std::move(entry));
        }

        return chunks;
@@ -1390,57 +1438,22 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
        return 1;
    }
-    auto proj_type = clip_get_projector_type(ctx_clip);

    int n_embd_out = ctx->n_embd_out();
    auto n_tokens_out = image_tokens->n_tokens();
    out_embd.resize((size_t)n_embd_out * n_tokens_out);

-    bool ok = false;
-
-    if (clip_is_llava(ctx_clip)
-        || proj_type == PROJECTOR_TYPE_MINICPMV
-        || proj_type == PROJECTOR_TYPE_GLM_EDGE
-        || proj_type == PROJECTOR_TYPE_INTERNVL
-        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
-        || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
-        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
-        const auto & entries = image_tokens->batch_f32.entries;
-        // entries may have different token counts
-        // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
-        size_t offset = 0;
-        for (size_t i = 0; i < entries.size(); i++) {
-            if (entries[i].is_placeholder()) {
-                LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
-                return 1;
-            }
-            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, &entries[i]);
-            std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
-            bool ok_i = clip_image_encode(
-                ctx_clip,
-                ctx->n_threads,
-                &entries[i],
-                tmp_embd);
-            if (!ok_i) {
-                LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
-                return 1;
-            }
-            ok = true;
-            std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
-            offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
-        }
-    } else {
-        if (image_tokens->is_placeholder()) {
-            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
-            return 1;
-        }
-        ok = clip_image_batch_encode(
-            ctx_clip,
-            ctx->n_threads,
-            &image_tokens->batch_f32,
-            out_embd);
+    if (image_tokens->is_placeholder()) {
+        LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+        return 1;
    }

+    bool ok = clip_image_batch_encode(
+        ctx_clip,
+        ctx->n_threads,
+        &image_tokens->batch_f32,
+        out_embd);
+
    return ok ? 0 : 1;
 }