mtmd: deepseek-ocr v1 multi-tile dynamic resolution + unified image-preprocessors for both versions (ds-ocr v1 and v2)

2026-07-05 02:30:22 +00:00 · 2026-06-15 11:57:03 +02:00
parent 74ade52741
commit d4bbef8083
9 changed files with 157 additions and 106 deletions
@@ -70,6 +70,7 @@ struct clip_hparams {
    std::vector<clip_image_size> image_res_candidates;
    int32_t preproc_min_tiles = 0;
    int32_t preproc_max_tiles = 0;
+    int32_t preproc_tile_size = 0; // local tile size (deepseek-ocr)
    resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
    resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
    pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)
@@ -1569,7 +1569,16 @@ struct clip_model_loader {
                        get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                        get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                        hparams.preproc_min_tiles = 2;
+                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) {
+                            hparams.preproc_max_tiles = 9;
+                            hparams.preproc_tile_size = 640;
+                            // the CLIP/ViT body runs its layernorms at 1e-5 (the SAM stage uses 1e-6)
+                            hparams.eps = 1e-5f;
+                        }
                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                            hparams.preproc_max_tiles = 6;
+                            hparams.preproc_tile_size = 768;
                            // qwen2 encoder is GQA, requires KEY_N_HEAD_KV
                            get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
                        }
@@ -3182,6 +3191,16 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
    return batch->entries[idx].get();
 }

+std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx) {
+    std::vector<float>  embd; // stays empty unless an F32 newline tensor is available
+    const ggml_tensor * newline = ctx->model.image_newline;
+    if (newline != nullptr && newline->type == GGML_TYPE_F32) {
+        embd.resize(ggml_nelements(newline));
+        ggml_backend_tensor_get(newline, embd.data(), 0, ggml_nbytes(newline));
+    }
+    return embd;
+}
+
 void clip_free(clip_ctx * ctx) {
    if (ctx == nullptr) {
        return;
@@ -3222,6 +3241,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
            return (img->nx() / params.patch_size) / 2;
        case PROJECTOR_TYPE_STEP3VL:
            return img->nx() / (params.patch_size * params.n_merge);
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+            return (img->nx() / params.patch_size) / 4;
        default:
            break;
    }
@@ -3431,10 +3453,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            // E.g., 64x64 -> 16x16 patches
            n_patches /= 16;

-            // build_global_local_features adds image newlines and view separator
-            // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches)
-            int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
-            n_patches = h * (h + 1) + 1;
+            if (img->add_viewsep) {
+                // global view: one image-newline per token-row + trailing view separator
+                const int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
+                n_patches = h * (h + 1) + 1;
+            }
        } break;
        case PROJECTOR_TYPE_HUNYUANVL:
            {
@@ -97,6 +97,9 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
 struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

+// read the image-newline embedding from the backend; empty if the model has none or it is not F32
+std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx);
+
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);

@@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
    const int n_heads = hparams.sam_n_head;
    const int d_heads = n_embd / n_heads;
    const int window  = hparams.attn_window_size;
+    // SAM stage runs its layernorms at 1e-6
+    const float sam_eps = 1e-6f;

    ggml_tensor * inpL;

@@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
        ggml_tensor * shortcut = cur;

        // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);

        const int64_t w0 = cur->ne[1];
        const int64_t h0 = cur->ne[2];
@@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
        ggml_tensor * inpFF = cur;

        // layernorm2
-        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);

        // ffn
        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
@@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {

    cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

    cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

    cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
@@ -303,16 +305,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
    cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
    cur = ggml_add(ctx0, cur, model.mm_fc_b);

-    const auto h     = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
-    const auto w     = h;
-    const auto n_dim = cur->ne[0];
+    // global view: weave one newline per row + trailing view separator
+    if (img.add_viewsep) {
+        const auto h     = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
+        const auto w     = h;
+        const auto n_dim = cur->ne[0];

-    ggml_tensor * imgnl;
-
-    imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
-    cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
+        ggml_tensor * imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
+        cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
+        cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
+    }

    cb(cur, "dsocr_output", -1);

@@ -1112,46 +1112,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
 // mtmd_image_preprocessor_deepseekocr
 //

-bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
-    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
-
-    const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
-
-    size_t  mode_i   = 0;
-    int64_t min_diff = std::numeric_limits<int64_t>::max();
-    for (size_t i = 0; i < std::size(native_resolutions); i++) {
-        const int64_t r    = native_resolutions[i];
-        const int64_t diff = std::abs(orig_area - r * r);
-        if (diff < min_diff) {
-            mode_i   = i;
-            min_diff = diff;
-        }
-    }
-    const int image_size = native_resolutions[mode_i];
-
-    // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
-    // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
-    clip_image_u8 padded;
-    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
-                     PAD_NEAREST, hparams.image_pad_color);
-
-    clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(res));
-
-    output.grid_x = 1;
-    output.grid_y = 1;
-    return true;
-}
-
-//
-// mtmd_image_preprocessor_deepseekocr2
-//
-
-// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
-// sorted by tile count
-std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
+std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr::get_target_ratios() const {
    std::vector<clip_image_size> ratios;
    for (int n = min_tiles; n <= max_tiles; n++) {
        for (int w = 1; w <= n; w++) {
@@ -1178,13 +1139,11 @@ std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ra
    return ratios;
 }

-// pick the grid whose aspect ratio is closest to the image
-// on a tie, prefer the larger grid when the image fits
-clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
+clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio(
    float                                aspect_ratio,
    const std::vector<clip_image_size> & target_ratios,
    int                                  width,
-    int                                  height) {
+    int                                  height) const {
    float           best_ratio_diff = std::numeric_limits<float>::max();
    clip_image_size best_ratio      = { 1, 1 };
    const float     area            = static_cast<float>(width * height);
@@ -1205,23 +1164,26 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
    return best_ratio;
 }

-bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    // emit 768x768 local tiles when the image is larger than a tile in either
-    // dimension, then always a 1024x1024 global view. order: [tiles..., global].
+bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // output order: [local tiles..., global]

+    int        grid_w   = 1;
+    int        grid_h   = 1;
    const auto img_size = img.get_size();
    if (img_size.width > tile_size || img_size.height > tile_size) {
        const float           aspect_ratio  = static_cast<float>(img_size.width) / img_size.height;
        const auto            target_ratios = get_target_ratios();
-        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
+        const clip_image_size grid =
+            find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
+        grid_w = grid.width;
+        grid_h = grid.height;

-        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
        clip_image_u8 refined;
-        img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
-                         RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
+        img_tool::resize(img, refined, { tile_size * grid_w, tile_size * grid_h }, RESIZE_ALGO_BICUBIC_PILLOW,
+                         PAD_NONE);

-        for (int row = 0; row < grid.height; row++) {
-            for (int col = 0; col < grid.width; col++) {
+        for (int row = 0; row < grid_h; row++) {
+            for (int col = 0; col < grid_w; col++) {
                clip_image_u8 tile;
                img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
                clip_image_f32_ptr res(clip_image_f32_init());
@@ -1231,17 +1193,17 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
        }
    }

-    // global view: aspect-preserving fit-and-pad to base_size.
+    // global view: aspect-preserving fit-and-pad to base_size
    clip_image_u8 padded;
-    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
-                     PAD_NEAREST, hparams.image_pad_color);
+    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST,
+                     hparams.image_pad_color);
    clip_image_f32_ptr global(clip_image_f32_init());
    img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
    global->add_viewsep = true;
    output.entries.push_back(std::move(global));
-
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.grid_x = grid_w;
+    output.grid_y = grid_h;
+    LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h);
    return true;
 }

@@ -139,29 +139,27 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };

+// DeepSeek-OCR (v1/v2) global view + optional local tile grid
 struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
-    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
-};
-
-// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
-// tiles when the image is larger than a tile in either dimension.
-struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
-    static constexpr int base_size = 1024; // global view
-    static constexpr int tile_size = 768;  // local tile
-    static constexpr int min_tiles = 2;
-    static constexpr int max_tiles = 6;
-
-    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx)
+        : mtmd_image_preprocessor(ctx),
+          base_size(hparams.image_size),
+          tile_size(hparams.preproc_tile_size),
+          min_tiles(hparams.preproc_min_tiles),
+          max_tiles(hparams.preproc_max_tiles) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;

 private:
-    static std::vector<clip_image_size> get_target_ratios();
-    static clip_image_size              find_closest_aspect_ratio(
-        float                                aspect_ratio,
-        const std::vector<clip_image_size> & target_ratios,
-        int                                  width,
-        int                                  height);
+    int base_size; // global view
+    int tile_size; // each tile
+    int min_tiles;
+    int max_tiles;
+
+    std::vector<clip_image_size> get_target_ratios() const;
+    clip_image_size find_closest_aspect_ratio(
+            float aspect_ratio,
+            const std::vector<clip_image_size> & target_ratios,
+            int width, int height) const;
 };

 // custom image preprocessing for Step3VL
@@ -612,14 +612,10 @@ struct mtmd_context {
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_DEEPSEEKOCR:
-                {
-                    img_end = "\n"; // prevent empty batch on llama-server
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
-                } break;
            case PROJECTOR_TYPE_DEEPSEEKOCR2:
                {
                    img_end = "\n"; // prevent empty batch on llama-server
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                } break;
            case PROJECTOR_TYPE_HUNYUANVL:
                {
@@ -1169,11 +1165,18 @@ struct mtmd_tokenizer {
            } else {

                size_t n_tokens = 0;
-                for (const auto & e : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
-                    if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
-                        // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
-                        break;
+                if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR && batch_f32.entries.size() > 1) {
+                    // v1 weaves the local tiles into a grid (one image-newline per token-row), then the global view
+                    const int h = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    n_tokens  = (h * batch_f32.grid_x + 1) * (h * batch_f32.grid_y);
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get());
+                } else {
+                    for (const auto & e : batch_f32.entries) {
+                        n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
+                        if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
+                            // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
+                            break;
+                        }
                    }
                }

@@ -1399,6 +1402,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
    }
 }

+// Stitch the tiles in raw into out, one newline per token-row, then append the overview (raw's last chunk).
+// raw and out hold n_embd floats per token; out must have room for the stitched result.
+// Example, 2x2 grid of tiles A B / C D:
+//   raw = [ A B C D <overview> ]
+//   out = A.row0 B.row0 n,  A.row1 B.row1 n,  ...,  C.row0 D.row0 n,  ...,  <overview>
+static void stitch_tile_grid(clip_ctx * ctx, const clip_image_f32_batch & batch,
+                             const std::vector<float> & raw, int n_embd, float * out) {
+    const int     n_tiles = static_cast<int>(batch.entries.size()) - 1; // overview is last
+    GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y);
+    const int     tile_h  = clip_n_output_tokens_x(ctx, batch.entries[0].get());
+    const size_t  row_sz  = static_cast<size_t>(tile_h) * n_embd;
+    const size_t  tile_sz = static_cast<size_t>(tile_h) * row_sz;
+    // overview = raw's last encoded chunk; size it from the entry, not raw.size() (raw is over-allocated)
+    const size_t  global_off = static_cast<size_t>(n_tiles) * tile_sz;
+    const size_t  global_sz  = static_cast<size_t>(clip_n_output_tokens(ctx, batch.entries.back().get())) * n_embd;
+    GGML_ASSERT(raw.size() >= global_off + global_sz); // every read from raw below stays in bounds
+    const std::vector<float> newline = clip_get_newline_embd(ctx);
+    GGML_ASSERT(newline.size() >= static_cast<size_t>(n_embd)); // n_embd floats are copied per woven row
+    for (int r = 0; r < batch.grid_y; r++) {
+        for (int pr = 0; pr < tile_h; pr++) {
+            for (int c = 0; c < batch.grid_x; c++) {
+                const float * tile = raw.data() + static_cast<size_t>(r * batch.grid_x + c) * tile_sz;
+                memcpy(out, tile + static_cast<size_t>(pr) * row_sz, row_sz * sizeof(float));
+                out += row_sz;
+            }
+            memcpy(out, newline.data(), static_cast<size_t>(n_embd) * sizeof(float));
+            out += n_embd;
+        }
+    }
+    memcpy(out, raw.data() + global_off, global_sz * sizeof(float));
+}
+
 static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
    clip_ctx * ctx_clip = ctx->ctx_v;
    if (!ctx_clip) {
@@ -1417,12 +1452,17 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
        || proj_type == PROJECTOR_TYPE_MINICPMV
        || proj_type == PROJECTOR_TYPE_GLM_EDGE
        || proj_type == PROJECTOR_TYPE_INTERNVL
+        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR
        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
        || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        // entries may have different token counts
        // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
+        // DeepSeek-OCR v1, when multi-view, weaves its tiles into a grid (see stitch_tile_grid)
+        const bool is_dsocr_mlt = proj_type == PROJECTOR_TYPE_DEEPSEEKOCR && entries.size() > 1;
+        std::vector<float> raw(is_dsocr_mlt ? static_cast<size_t>(n_embd_out) * n_tokens_out : 0);
+        float * dst = is_dsocr_mlt ? raw.data() : out_embd.data();
        size_t offset = 0;
        for (size_t i = 0; i < entries.size(); i++) {
            if (entries[i]->is_placeholder()) {
@@ -1441,9 +1481,12 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
                return 1;
            }
            ok = true;
-            std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
+            std::copy(tmp_embd.begin(), tmp_embd.end(), dst + offset);
            offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
        }
+        if (is_dsocr_mlt) {
+            stitch_tile_grid(ctx_clip, image_tokens->batch_f32, raw, n_embd_out, out_embd.data());
+        }
    } else {
        if (image_tokens->is_placeholder()) {
            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
@@ -82,6 +82,24 @@ CASES = [
        # is one pixel off and lands at ~0.69 instead.
        hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
    ),
+    TestCase(
+        model_key="v1", label="multi-tile (dynamic resolution)",
+        image="tools/mtmd/tests/test-1-positive.png",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid ->
+        # 2 local 640 tiles + 1 global 1024 view. Regression guard for the
+        # tiling preprocessor -- a broken tile path craters the score.
+        # hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly.
+        hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0,
+    ),
+    TestCase(
+        model_key="v2", label="multi-tile (dynamic resolution)",
+        image="tools/mtmd/tests/test-1-positive.png",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid ->
+        # 2 local 768 tiles + 1 global 1024 view = 545 image tokens.
+        hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0,
+    ),
 ]