diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 48796b6306..08ed0b3412 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -70,6 +70,7 @@ struct clip_hparams {
     std::vector<clip_image_size> image_res_candidates;
     int32_t preproc_min_tiles = 0;
     int32_t preproc_max_tiles = 0;
+    int32_t preproc_tile_size = 0; // local tile size (deepseek-ocr)
     resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
     resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
     pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 208486fd15..f83def0854 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1569,7 +1569,16 @@ struct clip_model_loader {
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                        hparams.preproc_min_tiles = 2;
+                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) {
+                            hparams.preproc_max_tiles = 9;
+                            hparams.preproc_tile_size = 640;
+                            // the CLIP/ViT body runs its layernorms at 1e-5 (the SAM stage uses 1e-6)
+                            hparams.eps = 1e-5f;
+                        }
                         if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                            hparams.preproc_max_tiles = 6;
+                            hparams.preproc_tile_size = 768;
                             // qwen2 encoder is GQA, requires KEY_N_HEAD_KV
                             get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
                         }
@@ -3182,6 +3191,16 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
     return batch->entries[idx].get();
 }
 
+std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx) {
+    const ggml_tensor * newline = ctx->model.image_newline;
+    std::vector<float> embd; // stays empty unless an F32 newline tensor is present
+    if (newline != nullptr && newline->type == GGML_TYPE_F32) {
+        embd.resize(ggml_nelements(newline));
+        ggml_backend_tensor_get(newline, embd.data(), 0, ggml_nbytes(newline)); // read it back from the backend
+    }
+    return embd;
+}
+
 void clip_free(clip_ctx * ctx) {
     if (ctx == nullptr) {
         return;
@@ -3222,6 +3241,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
             return (img->nx() / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
             return img->nx() / (params.patch_size * params.n_merge);
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+            return (img->nx() / params.patch_size) / 4;
         default:
             break;
     }
@@ -3431,10 +3453,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             // E.g., 64x64 -> 16x16 patches
             n_patches /= 16;
 
-            // build_global_local_features adds image newlines and view separator
-            // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches)
-            int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
-            n_patches = h * (h + 1) + 1;
+            if (img->add_viewsep) {
+                // global view: one image-newline per token-row + trailing view separator
+                const int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
+                n_patches = h * (h + 1) + 1;
+            }
         } break;
         case PROJECTOR_TYPE_HUNYUANVL:
             {
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 7197af8569..7b1a98a9aa 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -97,6 +97,9 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
 struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
 
+// read the image-newline embedding from the backend; empty if the model has none or it is not stored as F32
+std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx);
+
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);
 
diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp
index c3c22d0a4b..fbd4e2f730 100644
--- a/tools/mtmd/models/deepseekocr.cpp
+++ b/tools/mtmd/models/deepseekocr.cpp
@@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
     const int n_heads = hparams.sam_n_head;
     const int d_heads = n_embd / n_heads;
     const int window  = hparams.attn_window_size;
+    // SAM stage runs its layernorms at 1e-6
+    const float sam_eps = 1e-6f;
 
     ggml_tensor * inpL;
 
@@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
         ggml_tensor * shortcut = cur;
 
         // layernorm1
-        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);
 
         const int64_t w0 = cur->ne[1];
         const int64_t h0 = cur->ne[2];
@@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
         ggml_tensor * inpFF = cur;
 
         // layernorm2
-        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);
 
         // ffn
         cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
@@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
 
     cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
     cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
+    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
     cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
 
     cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
@@ -303,16 +305,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
     cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
     cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
-    const auto h     = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
-    const auto w     = h;
-    const auto n_dim = cur->ne[0];
+    // global view: weave one newline per row + trailing view separator
+    if (img.add_viewsep) {
+        const auto h     = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
+        const auto w     = h;
+        const auto n_dim = cur->ne[0];
 
-    ggml_tensor * imgnl;
-
-    imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
-    cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
+        ggml_tensor * imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
+        cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
+        cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)
+    }
 
     cb(cur, "dsocr_output", -1);
 
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index bedf44e07c..dd83515e42 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1112,46 +1112,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
 // mtmd_image_preprocessor_deepseekocr
 //
 
-bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
-    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
-
-    const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
-
-    size_t  mode_i   = 0;
-    int64_t min_diff = std::numeric_limits<int64_t>::max();
-    for (size_t i = 0; i < std::size(native_resolutions); i++) {
-        const int64_t r    = native_resolutions[i];
-        const int64_t diff = std::abs(orig_area - r * r);
-        if (diff < min_diff) {
-            mode_i   = i;
-            min_diff = diff;
-        }
-    }
-    const int image_size = native_resolutions[mode_i];
-
-    // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
-    // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
-    clip_image_u8 padded;
-    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
-                     PAD_NEAREST, hparams.image_pad_color);
-
-    clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(res));
-
-    output.grid_x = 1;
-    output.grid_y = 1;
-    return true;
-}
-
-//
-// mtmd_image_preprocessor_deepseekocr2
-//
-
-// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
-// sorted by tile count
-std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
+std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr::get_target_ratios() const {
     std::vector<clip_image_size> ratios;
     for (int n = min_tiles; n <= max_tiles; n++) {
         for (int w = 1; w <= n; w++) {
@@ -1178,13 +1139,11 @@ std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ra
     return ratios;
 }
 
-// pick the grid whose aspect ratio is closest to the image
-// on a tie, prefer the larger grid when the image fits
-clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
+clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio(
     float                                aspect_ratio,
     const std::vector<clip_image_size> & target_ratios,
     int                                  width,
-    int                                  height) {
+    int                                  height) const {
     float           best_ratio_diff = std::numeric_limits<float>::max();
     clip_image_size best_ratio      = { 1, 1 };
     const float     area            = static_cast<float>(width * height);
@@ -1205,23 +1164,26 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
     return best_ratio;
 }
 
-bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    // emit 768x768 local tiles when the image is larger than a tile in either
-    // dimension, then always a 1024x1024 global view. order: [tiles..., global].
+bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // output order: [local tiles..., global]
 
+    int        grid_w   = 1;
+    int        grid_h   = 1;
     const auto img_size = img.get_size();
     if (img_size.width > tile_size || img_size.height > tile_size) {
         const float           aspect_ratio  = static_cast<float>(img_size.width) / img_size.height;
         const auto            target_ratios = get_target_ratios();
-        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
+        const clip_image_size grid =
+            find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
+        grid_w = grid.width;
+        grid_h = grid.height;
 
-        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
         clip_image_u8 refined;
-        img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
-                         RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
+        img_tool::resize(img, refined, { tile_size * grid_w, tile_size * grid_h }, RESIZE_ALGO_BICUBIC_PILLOW,
+                         PAD_NONE);
 
-        for (int row = 0; row < grid.height; row++) {
-            for (int col = 0; col < grid.width; col++) {
+        for (int row = 0; row < grid_h; row++) {
+            for (int col = 0; col < grid_w; col++) {
                 clip_image_u8 tile;
                 img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
                 clip_image_f32_ptr res(clip_image_f32_init());
@@ -1231,17 +1193,17 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
         }
     }
 
-    // global view: aspect-preserving fit-and-pad to base_size.
+    // global view: aspect-preserving fit-and-pad to base_size
     clip_image_u8 padded;
-    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
-                     PAD_NEAREST, hparams.image_pad_color);
+    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST,
+                     hparams.image_pad_color);
     clip_image_f32_ptr global(clip_image_f32_init());
     img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
     global->add_viewsep = true;
     output.entries.push_back(std::move(global));
-
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.grid_x = grid_w;
+    output.grid_y = grid_h;
+    LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h);
     return true;
 }
 
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
index 91a5bc253e..5ff4a3789f 100644
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@@ -139,29 +139,27 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 
+// DeepSeek-OCR (v1/v2) global view + optional local tile grid
 struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
-    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
-};
-
-// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
-// tiles when the image is larger than a tile in either dimension.
-struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
-    static constexpr int base_size = 1024; // global view
-    static constexpr int tile_size = 768;  // local tile
-    static constexpr int min_tiles = 2;
-    static constexpr int max_tiles = 6;
-
-    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx)
+        : mtmd_image_preprocessor(ctx),
+          base_size(hparams.image_size),
+          tile_size(hparams.preproc_tile_size),
+          min_tiles(hparams.preproc_min_tiles),
+          max_tiles(hparams.preproc_max_tiles) {}
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 
 private:
-    static std::vector<clip_image_size> get_target_ratios();
-    static clip_image_size              find_closest_aspect_ratio(
-        float                                aspect_ratio,
-        const std::vector<clip_image_size> & target_ratios,
-        int                                  width,
-        int                                  height);
+    int base_size; // global view
+    int tile_size; // each tile
+    int min_tiles;
+    int max_tiles;
+
+    std::vector<clip_image_size> get_target_ratios() const;
+    clip_image_size find_closest_aspect_ratio(
+            float aspect_ratio,
+            const std::vector<clip_image_size> & target_ratios,
+            int width, int height) const;
 };
 
 // custom image preprocessing for Step3VL
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index ad709227f7..9034b3496f 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -612,14 +612,10 @@ struct mtmd_context {
                     image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                 } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR:
-                {
-                    img_end = "\n"; // prevent empty batch on llama-server
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
-                } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR2:
                 {
                     img_end = "\n"; // prevent empty batch on llama-server
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
@@ -1169,11 +1165,18 @@ struct mtmd_tokenizer {
             } else {
 
                 size_t n_tokens = 0;
-                for (const auto & e : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
-                    if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
-                        // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
-                        break;
+                if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR && batch_f32.entries.size() > 1) {
+                    // v1 weaves the local tiles into a grid (one image-newline per token-row), then the global view
+                    const int h = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    n_tokens  = (h * batch_f32.grid_x + 1) * (h * batch_f32.grid_y);
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get());
+                } else {
+                    for (const auto & e : batch_f32.entries) {
+                        n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
+                        if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
+                            // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
+                            break;
+                        }
                     }
                 }
 
@@ -1399,6 +1402,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     }
 }
 
+// Weave the encoded local tiles into one grid in `out`, row-major, then append the overview embedding.
+// raw: encoder outputs laid out back to back as [tile 0 ... tile n-1, overview]; each tile is tile_h x tile_h tokens
+// out: receives (tile_h*grid_x + 1) * (tile_h*grid_y) tokens of n_embd floats (+1 = newline per token-row), then the overview
+// e.g. 2x2 grid A B / C D:  out = A.row0 B.row0 n,  A.row1 B.row1 n,  ...,  C.row0 D.row0 n,  ...,  <overview>
+static void stitch_tile_grid(clip_ctx * ctx, const clip_image_f32_batch & batch,
+                             const std::vector<float> & raw, int n_embd, float * out) {
+    const auto &  entries = batch.entries;
+    const int     n_tiles = static_cast<int>(entries.size()) - 1; // overview is last
+    GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y);
+    const int     tile_h  = clip_n_output_tokens_x(ctx, entries[0].get());
+    const size_t  row_sz  = static_cast<size_t>(tile_h) * n_embd;
+    const size_t  tile_sz = static_cast<size_t>(tile_h) * row_sz;
+    const std::vector<float> newline = clip_get_newline_embd(ctx);
+    // the per-row memcpy below reads n_embd floats from newline, so a shorter embedding must be rejected here
+    GGML_ASSERT(!newline.empty() && newline.size() >= static_cast<size_t>(n_embd));
+    for (int r = 0; r < batch.grid_y; r++) {
+        for (int pr = 0; pr < tile_h; pr++) {
+            for (int c = 0; c < batch.grid_x; c++) {
+                const float * tile = raw.data() + static_cast<size_t>(r * batch.grid_x + c) * tile_sz;
+                memcpy(out, tile + static_cast<size_t>(pr) * row_sz, row_sz * sizeof(float));
+                out += row_sz;
+            }
+            memcpy(out, newline.data(), static_cast<size_t>(n_embd) * sizeof(float));
+            out += n_embd;
+        }
+    }
+    // overview = raw's last encoded chunk; size it from the entry, not raw.size() (raw is over-allocated)
+    const size_t global_off = static_cast<size_t>(n_tiles) * tile_sz;
+    const size_t global_sz  = static_cast<size_t>(clip_n_output_tokens(ctx, entries.back().get())) * n_embd;
+    memcpy(out, raw.data() + global_off, global_sz * sizeof(float));
+}
+
 static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
     clip_ctx * ctx_clip = ctx->ctx_v;
     if (!ctx_clip) {
@@ -1417,12 +1452,17 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
         || proj_type == PROJECTOR_TYPE_MINICPMV
         || proj_type == PROJECTOR_TYPE_GLM_EDGE
         || proj_type == PROJECTOR_TYPE_INTERNVL
+        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR
         || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
         || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         // entries may have different token counts
         // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
+        // DeepSeek-OCR v1, when multi-view, weaves its tiles into a grid (see stitch_tile_grid)
+        const bool is_dsocr_mlt = proj_type == PROJECTOR_TYPE_DEEPSEEKOCR && entries.size() > 1;
+        std::vector<float> raw(is_dsocr_mlt ? static_cast<size_t>(n_embd_out) * n_tokens_out : 0);
+        float * dst = is_dsocr_mlt ? raw.data() : out_embd.data();
         size_t offset = 0;
         for (size_t i = 0; i < entries.size(); i++) {
             if (entries[i]->is_placeholder()) {
@@ -1441,9 +1481,12 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
                 return 1;
             }
             ok = true;
-            std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
+            std::copy(tmp_embd.begin(), tmp_embd.end(), dst + offset);
             offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
         }
+        if (is_dsocr_mlt) {
+            stitch_tile_grid(ctx_clip, image_tokens->batch_f32, raw, n_embd_out, out_embd.data());
+        }
     } else {
         if (image_tokens->is_placeholder()) {
             LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
diff --git a/tools/mtmd/tests/test-1-positive.png b/tools/mtmd/tests/test-1-positive.png
new file mode 100644
index 0000000000..007614594e
Binary files /dev/null and b/tools/mtmd/tests/test-1-positive.png differ
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index 5f5fef765a..f641045355 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -82,6 +82,24 @@ CASES = [
         # is one pixel off and lands at ~0.69 instead.
         hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
     ),
+    TestCase(
+        model_key="v1", label="multi-tile (dynamic resolution)",
+        image="tools/mtmd/tests/test-1-positive.png",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid ->
+        # 2 local 640 tiles + 1 global 1024 view. Regression guard for the
+        # tiling preprocessor -- a broken tile path craters the score.
+        # hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly.
+        hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0,
+    ),
+    TestCase(
+        model_key="v2", label="multi-tile (dynamic resolution)",
+        image="tools/mtmd/tests/test-1-positive.png",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid ->
+        # 2 local 768 tiles + 1 global 1024 view = 545 image tokens.
+        hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0,
+    ),
 ]