From db52540f730de39efcf7172d4ab1f79bb50556e2 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Fri, 19 Jun 2026 01:16:16 +0200
Subject: [PATCH] mtmd: add batching support for internvl (#24775)

---
 tools/mtmd/clip.cpp            |  2 +-
 tools/mtmd/models/internvl.cpp | 18 +++++++++++-------
 tools/mtmd/models/models.h     |  1 +
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index dc62232957..17079815d4 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -534,7 +534,7 @@ ggml_tensor * clip_graph::build_vit(
 ggml_tensor * clip_graph::build_inp() {
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+    inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
     inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
     if (model.patch_bias) {
         inp = ggml_add(ctx0, inp, model.patch_bias);
diff --git a/tools/mtmd/models/internvl.cpp b/tools/mtmd/models/internvl.cpp
index 9aded3b97c..65d7d5a6b7 100644
--- a/tools/mtmd/models/internvl.cpp
+++ b/tools/mtmd/models/internvl.cpp
@@ -8,7 +8,9 @@ ggml_cgraph * clip_graph_internvl::build() {
     ggml_tensor * inp = build_inp();
 
     // add CLS token
-    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+    ggml_tensor * cls_repeated = ggml_repeat_4d(ctx0, model.class_embedding,
+            model.class_embedding->ne[0], 1, n_batch, 1);
+    inp = ggml_concat(ctx0, inp, cls_repeated, 1);
 
     // The larger models use a different ViT, which uses RMS norm instead of layer norm
     // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
@@ -24,14 +26,15 @@ ggml_cgraph * clip_graph_internvl::build() {
                             nullptr);
 
     // remove CLS token
-    cur = ggml_view_2d(ctx0, cur,
-        n_embd, n_patches,
-        ggml_row_size(cur->type, n_embd), 0);
+    cur = ggml_view_3d(ctx0, cur,
+        n_embd, n_patches, n_batch,
+        cur->nb[1], cur->nb[2], 0);
+    cur = ggml_cont(ctx0, cur);
 
     // pixel shuffle
     {
         const int scale_factor = model.hparams.n_merge;
-        const int bsz    = 1; // batch size, always 1 for now since we don't support batching
+        const int bsz    = n_batch;
         const int height = n_patches_y;
         const int width  = n_patches_x;
         GGML_ASSERT(scale_factor > 0);
@@ -44,9 +47,10 @@ ggml_cgraph * clip_graph_internvl::build() {
             bsz);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
         // flatten to 2D
-        cur = ggml_cont_2d(ctx0, cur,
+        cur = ggml_cont_3d(ctx0, cur,
             n_embd * scale_factor * scale_factor,
-            cur->ne[1] * cur->ne[2]);
+            cur->ne[1] * cur->ne[2],
+            cur->ne[3]);
     }
 
     // projector (always using GELU activation)
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 3a15f76829..12d5e69493 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -80,6 +80,7 @@ struct clip_graph_minicpmv4_6 : clip_graph {
 struct clip_graph_internvl : clip_graph {
     clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
+    bool support_batch() const override { return true; } // advertises batching: build() in internvl.cpp shapes its tensors by n_batch
 };
 
 struct clip_graph_nemotron_v2_vl : clip_graph {