diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index a807fefda3..57afb542d4 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -26,6 +26,13 @@ void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32
     entries.push_back(std::move(img));
 }
 
+void mtmd_image_preproc_out::append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) {
+    // fill the single overview slot from the u8 image; mean/std normalization only on request
+    overview.from_u8(img);
+    if (!normalized) { return; }
+    overview.normalize(hparams.image_mean, hparams.image_std);
+}
+
 // set of tools to manipulate images
 // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
 struct img_tool {
@@ -607,10 +614,11 @@ private:
 mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) {
     const clip_image_size original_size = img.get_size();
     auto const inst = get_slice_instructions(original_size);
-    std::vector<clip_image_u8> imgs = slice_image(img, inst);
+    auto sliced = slice_image(img, inst);
 
     mtmd_image_preproc_out output;
-    output.append(hparams, imgs, true);
+    output.append_overview(hparams, sliced.overview, true);
+    output.append(hparams, sliced.slices, true);
     output.grid_x = inst.grid_size.width;
     output.grid_y = inst.grid_size.height;
 
@@ -722,22 +730,15 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll
     return res;
 }
 
-std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
-    std::vector<clip_image_u8> output;
+mtmd_image_preprocessor_llava_uhd::slice_output mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst) {
+    slice_output output;
 
     // resize to overview size
-    clip_image_u8 resized_img;
-    img_tool::resize(img, resized_img, inst.overview_size, hparams.image_resize_algo_ov,
+    img_tool::resize(img, output.overview, inst.overview_size, hparams.image_resize_algo_ov,
                         hparams.image_pad_ov, hparams.image_pad_color_ov);
-    if (overview_first) {
-        output.push_back(resized_img);
-    }
 
     if (inst.slices.empty()) {
-        // no slices, just return the resized image
-        if (!overview_first) {
-            output.push_back(resized_img);
-        }
+        // no slices, just return the overview image
         return output;
     }
 
@@ -755,11 +756,7 @@ std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const
 
         clip_image_u8 img_slice;
         img_tool::crop(refined_img, img_slice, x, y, w, h);
-        output.push_back(std::move(img_slice));
-    }
-
-    if (!overview_first) {
-        output.push_back(resized_img);
+        output.slices.push_back(std::move(img_slice));
     }
 
     return output;
@@ -1077,10 +1074,11 @@ mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_i
             });
         }
     }
-    auto imgs = slice_image(img, instructions);
+    auto sliced = slice_image(img, instructions);
 
     mtmd_image_preproc_out output;
-    output.append(hparams, imgs, true);
+    output.append_overview(hparams, sliced.overview, true);
+    output.append(hparams, sliced.slices, true);
     output.grid_x = instructions.grid_size.width;
     output.grid_y = instructions.grid_size.height;
     return output;
@@ -1094,10 +1092,12 @@ mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_i
     GGML_ASSERT(!hparams.image_res_candidates.empty());
     const clip_image_size original_size = img.get_size();
     auto const inst = get_slice_instructions(original_size);
-    std::vector<clip_image_u8> imgs = slice_image(img, inst, false);
+    auto sliced = slice_image(img, inst);
 
     mtmd_image_preproc_out output;
-    output.append(hparams, imgs, true);
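+    // InternVL: overview goes after the slices; it is stored separately from entries, so the call order here has no effect (see ov_img_first on the mtmd_context)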
+    output.append(hparams, sliced.slices, true);
+    output.append_overview(hparams, sliced.overview, true);
     output.grid_x = inst.grid_size.width;
     output.grid_y = inst.grid_size.height;
     return output;
@@ -1131,9 +1131,10 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const cli
     img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
                      PAD_NEAREST, hparams.image_pad_color);
     mtmd_image_preproc_out output;
-    output.append(hparams, padded, true);
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.append_overview(hparams, padded, true);
+    output.grid_x = 0;
+    output.grid_y = 0;
+    // TODO @ngxson : support slicing for DeepSeek-OCR, to do in another PR
     return output;
 }
 
@@ -1226,10 +1227,8 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const cl
     clip_image_u8 padded;
     img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
                      PAD_NEAREST, hparams.image_pad_color);
-    output.append(hparams, padded, true);
-    output.entries.back().add_viewsep = true;
-    output.grid_x = 1;
-    output.grid_y = 1;
+    output.append_overview(hparams, padded, true);
+    output.overview.add_viewsep = true;
     return output;
 }
 
@@ -1447,15 +1446,14 @@ mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_im
     const auto instructions = build_slice_instructions(hparams, prepared.get_size());
 
     mtmd_image_preproc_out output;
-    clip_image_f32 overview_f32;
+    // overview (normalized f32, already includes mean/std)
     img_u8_resize_bilinear_to_f32(
         prepared,
-        overview_f32,
+        output.overview,
         hparams.image_size,
         hparams.image_size,
         hparams.image_mean,
         hparams.image_std);
-    output.append(hparams, overview_f32, false);
 
     if (instructions.slices.empty()) {
         output.grid_x = 0;
@@ -1548,13 +1546,13 @@ mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_im
 
 mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) {
     auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img);
-    if (output.entries.size() == 1) {
+    if (output.entries.size() == 0) {
         // Single-tile (overview only): append one newline row.
-        output.entries[0].add_newline = true;
+        output.overview.add_newline = true;
     } else {
         // Multi-tile: overview gets no newline, grid tiles get one.
-        output.entries[0].add_newline = false;
-        for (size_t i = 1; i < output.entries.size(); ++i) {
+        output.overview.add_newline = false;
+        for (size_t i = 0; i < output.entries.size(); ++i) {
             output.entries[i].add_newline = true;
         }
     }
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
index 8819a135a1..f458e39e76 100644
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@@ -11,11 +11,19 @@
 struct mtmd_image_preproc_out {
     std::vector<clip_image_f32> entries;
     // grid size is required for llava-uhd style models
+
+    clip_image_f32 overview; // overview image (downscaled image), kept separate from entries
     int grid_x = 0;
     int grid_y = 0;
+
     void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
     void append(const clip_hparams & hparams, const std::vector<clip_image_u8> & imgs, bool normalized = true);
     void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true);
+
+    void append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
+    bool has_overview() const {
+        return overview.nx() > 0 && overview.ny() > 0; // both dimensions must be non-zero, matching the nx == 0 || ny == 0 rejection in split_batch_to_chunk
+    }
 };
 
 // base class, models must inherit from this class
@@ -46,6 +54,8 @@ struct mtmd_image_preprocessor {
  * [overview] --> [slice 1] --> [slice 2]
  *           |                |
  *           +--> [slice 3] --> [slice 4]
+ *
+ * NOTE: for the ordering of overview, set "ov_img_first" on the mtmd_context
  */
 struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
     mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
@@ -67,7 +77,11 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
     // LFM2 override this function to implement its custom slicing logic
     virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
 
-    std::vector<clip_image_u8> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
+    struct slice_output {
+        clip_image_u8 overview;            // input image resized to inst.overview_size by slice_image
+        std::vector<clip_image_u8> slices; // cropped tiles; left empty when inst.slices is empty
+    };
+    slice_output slice_image(const clip_image_u8 & img, const slice_instructions & inst);
 
 private:
     clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index f9ee021ddb..abba2ebf2c 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -516,6 +516,7 @@ struct mtmd_context {
                     LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
                             "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
                     image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                    ov_img_first = false;
                 } break;
             case PROJECTOR_TYPE_STEP3VL:
                 {
@@ -539,6 +540,7 @@ struct mtmd_context {
                     img_beg = "<img>";
                     img_end = "</img>";
                     image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
+                    ov_img_first = false;
                 } break;
             case PROJECTOR_TYPE_KIMIVL:
                 {
@@ -615,11 +617,13 @@ struct mtmd_context {
                 {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
+                    ov_img_first = false;
                 } break;
             case PROJECTOR_TYPE_DEEPSEEKOCR2:
                 {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                    ov_img_first = false;
                 } break;
             case PROJECTOR_TYPE_HUNYUANVL:
                 {
@@ -640,6 +644,7 @@ struct mtmd_context {
                     img_beg = "<image>";
                     img_end = "";
                     image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
+                    ov_img_first = true;
                 } break;
             default:
                 throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1079,26 +1084,38 @@ struct mtmd_tokenizer {
 
                 // for llava-uhd style, we need to handle grid too
                 // we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd
-                if (tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) {
+                if ((tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0)
+                        || tmp_preproc_out.has_overview()) {
                     GGML_ASSERT(bitmaps.size() == 1);
                     preproc_out.grid_x = tmp_preproc_out.grid_x;
                     preproc_out.grid_y = tmp_preproc_out.grid_y;
+                    preproc_out.overview = std::move(tmp_preproc_out.overview);
                 }
             }
 
+            LOG_DBG("%s: preproc_out has %zu entries, grid_x = %d, grid_y = %d, has_overview = %d\n",
+                    __func__, preproc_out.entries.size(), preproc_out.grid_x, preproc_out.grid_y,
+                    preproc_out.has_overview() ? 1 : 0);
+
             // handle llava-uhd style preprocessing
-            const bool has_tiling_grid = preproc_out.grid_x > 0 && preproc_out.grid_y > 0;
+            // (output either a grid, or overview-only)
+            const bool has_tiling_grid = (preproc_out.grid_x > 0 && preproc_out.grid_y > 0)
+                || preproc_out.has_overview();
+
             if (has_tiling_grid) {
                 // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
                 GGML_ASSERT(bitmaps.size() == 1);
 
                 const int n_col = preproc_out.grid_x;
                 const int n_row = preproc_out.grid_y;
+
                 // split batch into chunks of single images
-                // NOTE: preproc_out will be invalidated after this call
                 auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id);
                 GGML_ASSERT(chunks.size() > 0);
 
+                // NOTE: preproc_out is invalidated after this point, do not use it anymore
+
+                // split_batch_to_chunk must always put the overview image first
                 auto ov_chunk = std::move(chunks.front());
                 chunks.erase(chunks.begin());
 
@@ -1125,7 +1142,16 @@ struct mtmd_tokenizer {
                                 std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
                                 add_text(std::string(buf.get(), buf.get() + sz - 1), true);
                             }
-                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
+
+                            auto & curr_chunk = chunks[y * n_col + x];
+                            auto & curr_batch = curr_chunk.tokens_image->batch_f32;
+                            if (curr_batch.entries.size() != 1) {
+                                throw std::runtime_error(string_format("%s: expect 1 image in batch_f32", __func__));
+                            }
+
+                            LOG_DBG("%s: adding slice image at row %d col %d\n", __func__, y, x);
+                            cur.entries.emplace_back(std::move(curr_chunk));
+
                             add_text(ctx->tok_sli_img_end);
                             if (!is_last_in_row) {
                                 add_text(ctx->tok_sli_img_mid);
@@ -1147,6 +1173,11 @@ struct mtmd_tokenizer {
 
             } else {
 
+                if (preproc_out.entries.size() == 0) {
+                    LOG_ERR("%s: no image tokens produced by preprocessor (ref: https://github.com/ggml-org/llama.cpp/pull/24769)\n", __func__);
+                    return 2;
+                }
+
                 size_t n_tokens = 0;
                 for (auto & e : preproc_out.entries) {
                     n_tokens += clip_n_output_tokens(ctx->ctx_v, &e);
@@ -1303,13 +1334,15 @@ struct mtmd_tokenizer {
     std::vector<mtmd_input_chunk> split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) {
         std::vector<mtmd_input_chunk> chunks;
 
-        for (auto & entry : preproc_out.entries) {
+        auto process_chunk = [&](clip_image_f32 && img) {
             mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &entry);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &img);
             image_tokens->ny = 1;
-            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->batch_f32.entries.push_back(std::move(img));
             image_tokens->id = id;
 
+            GGML_ASSERT(image_tokens->nx > 0);
+
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {}, // text tokens
@@ -1317,6 +1350,21 @@ struct mtmd_tokenizer {
                 nullptr, // audio tokens
             };
             chunks.emplace_back(std::move(chunk));
+        };
+
+        // overview image first
+        auto & overview = preproc_out.overview;
+        if (overview.nx() == 0 || overview.ny() == 0) {
+            throw std::runtime_error(string_format("%s: invalid overview image for llava-uhd style preprocessing\n", __func__));
+        }
+        process_chunk(std::move(preproc_out.overview));
+
+        // then, process slices
+        for (auto & entry : preproc_out.entries) {
+            if (entry.nx() == 0 || entry.ny() == 0) {
+                throw std::runtime_error(string_format("%s: invalid image slice for llava-uhd style preprocessing\n", __func__));
+            }
+            process_chunk(std::move(entry));
         }
 
         return chunks;
@@ -1390,57 +1438,22 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
         LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
         return 1;
     }
-    auto proj_type = clip_get_projector_type(ctx_clip);
 
     int n_embd_out = ctx->n_embd_out();
     auto n_tokens_out = image_tokens->n_tokens();
     out_embd.resize((size_t)n_embd_out * n_tokens_out);
 
-    bool ok = false;
-
-    if (clip_is_llava(ctx_clip)
-        || proj_type == PROJECTOR_TYPE_MINICPMV
-        || proj_type == PROJECTOR_TYPE_GLM_EDGE
-        || proj_type == PROJECTOR_TYPE_INTERNVL
-        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
-        || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
-        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
-        const auto & entries = image_tokens->batch_f32.entries;
-        // entries may have different token counts
-        // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
-        size_t offset = 0;
-        for (size_t i = 0; i < entries.size(); i++) {
-            if (entries[i].is_placeholder()) {
-                LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
-                return 1;
-            }
-            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, &entries[i]);
-            std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
-            bool ok_i = clip_image_encode(
-                ctx_clip,
-                ctx->n_threads,
-                &entries[i],
-                tmp_embd);
-            if (!ok_i) {
-                LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
-                return 1;
-            }
-            ok = true;
-            std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
-            offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
-        }
-    } else {
-        if (image_tokens->is_placeholder()) {
-            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
-            return 1;
-        }
-        ok = clip_image_batch_encode(
-            ctx_clip,
-            ctx->n_threads,
-            &image_tokens->batch_f32,
-            out_embd);
+    if (image_tokens->is_placeholder()) {
+        LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+        return 1;
     }
 
+    bool ok = clip_image_batch_encode(
+        ctx_clip,
+        ctx->n_threads,
+        &image_tokens->batch_f32,
+        out_embd);
+
     return ok ? 0 : 1;
 }