From db52540f730de39efcf7172d4ab1f79bb50556e2 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 19 Jun 2026 01:16:16 +0200 Subject: [PATCH] mtmd: add batching support for internvl (#24775) --- tools/mtmd/clip.cpp | 2 +- tools/mtmd/models/internvl.cpp | 18 +++++++++++------- tools/mtmd/models/models.h | 1 + 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index dc62232957..17079815d4 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -534,7 +534,7 @@ ggml_tensor * clip_graph::build_vit( ggml_tensor * clip_graph::build_inp() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch); inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); if (model.patch_bias) { inp = ggml_add(ctx0, inp, model.patch_bias); diff --git a/tools/mtmd/models/internvl.cpp b/tools/mtmd/models/internvl.cpp index 9aded3b97c..65d7d5a6b7 100644 --- a/tools/mtmd/models/internvl.cpp +++ b/tools/mtmd/models/internvl.cpp @@ -8,7 +8,9 @@ ggml_cgraph * clip_graph_internvl::build() { ggml_tensor * inp = build_inp(); // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + ggml_tensor * cls_repeated = ggml_repeat_4d(ctx0, model.class_embedding, + model.class_embedding->ne[0], 1, n_batch, 1); + inp = ggml_concat(ctx0, inp, cls_repeated, 1); // The larger models use a different ViT, which uses RMS norm instead of layer norm // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 @@ -24,14 +26,15 @@ ggml_cgraph * clip_graph_internvl::build() { nullptr); // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); + cur = ggml_view_3d(ctx0, cur, + n_embd, n_patches, n_batch, + cur->nb[1], cur->nb[2], 0); + cur = ggml_cont(ctx0, cur); // pixel shuffle { const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int bsz = n_batch; const int height = n_patches_y; const int width = n_patches_x; GGML_ASSERT(scale_factor > 0); @@ -44,9 +47,10 @@ ggml_cgraph * clip_graph_internvl::build() { bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, - cur->ne[1] * cur->ne[2]); + cur->ne[1] * cur->ne[2], + cur->ne[3]); } // projector (always using GELU activation) diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 3a15f76829..12d5e69493 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -80,6 +80,7 @@ struct clip_graph_minicpmv4_6 : clip_graph { struct clip_graph_internvl : clip_graph { clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + bool support_batch() const override { return true; } }; struct clip_graph_nemotron_v2_vl : clip_graph {