mtmd: refactor llava-uhd overview image handling (always use ov_img_first) (#24769)

* add dedicated "overview" for mtmd_image_preproc_out

* corrections

* correct (again)

* nits

* nits (2)
This commit is contained in:
Xuan-Son Nguyen
2026-06-18 18:53:49 +02:00
committed by GitHub
parent d2c67959b3
commit 060ce1bf72
3 changed files with 113 additions and 88 deletions
+34 -36
View File
@@ -26,6 +26,13 @@ void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32
entries.push_back(std::move(img));
}
// Fill the dedicated `overview` member from an 8-bit image.
//
// Unlike append(), this does not push anything into `entries`: the converted
// image is written to the single `overview` member.
//
// hparams    : supplies image_mean / image_std used for normalization
// img        : source 8-bit image to convert
// normalized : when true, apply mean/std normalization after the conversion
void mtmd_image_preproc_out::append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) {
    overview.from_u8(img);
    if (!normalized) {
        // caller asked to skip the mean/std normalization step
        return;
    }
    overview.normalize(hparams.image_mean, hparams.image_std);
}
// set of tools to manipulate images
// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
struct img_tool {
@@ -607,10 +614,11 @@ private:
mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) {
const clip_image_size original_size = img.get_size();
auto const inst = get_slice_instructions(original_size);
std::vector<clip_image_u8> imgs = slice_image(img, inst);
auto sliced = slice_image(img, inst);
mtmd_image_preproc_out output;
output.append(hparams, imgs, true);
output.append_overview(hparams, sliced.overview, true);
output.append(hparams, sliced.slices, true);
output.grid_x = inst.grid_size.width;
output.grid_y = inst.grid_size.height;
@@ -722,22 +730,15 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll
return res;
}
std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
std::vector<clip_image_u8> output;
mtmd_image_preprocessor_llava_uhd::slice_output mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst) {
slice_output output;
// resize to overview size
clip_image_u8 resized_img;
img_tool::resize(img, resized_img, inst.overview_size, hparams.image_resize_algo_ov,
img_tool::resize(img, output.overview, inst.overview_size, hparams.image_resize_algo_ov,
hparams.image_pad_ov, hparams.image_pad_color_ov);
if (overview_first) {
output.push_back(resized_img);
}
if (inst.slices.empty()) {
// no slices, just return the resized image
if (!overview_first) {
output.push_back(resized_img);
}
// no slices, just return the overview image
return output;
}
@@ -755,11 +756,7 @@ std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const
clip_image_u8 img_slice;
img_tool::crop(refined_img, img_slice, x, y, w, h);
output.push_back(std::move(img_slice));
}
if (!overview_first) {
output.push_back(resized_img);
output.slices.push_back(std::move(img_slice));
}
return output;
@@ -1077,10 +1074,11 @@ mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_i
});
}
}
auto imgs = slice_image(img, instructions);
auto sliced = slice_image(img, instructions);
mtmd_image_preproc_out output;
output.append(hparams, imgs, true);
output.append_overview(hparams, sliced.overview, true);
output.append(hparams, sliced.slices, true);
output.grid_x = instructions.grid_size.width;
output.grid_y = instructions.grid_size.height;
return output;
@@ -1094,10 +1092,12 @@ mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_i
GGML_ASSERT(!hparams.image_res_candidates.empty());
const clip_image_size original_size = img.get_size();
auto const inst = get_slice_instructions(original_size);
std::vector<clip_image_u8> imgs = slice_image(img, inst, false);
auto sliced = slice_image(img, inst);
mtmd_image_preproc_out output;
output.append(hparams, imgs, true);
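// InternVL: slices first, then overview (the emitted order is decided by "ov_img_first" on the mtmd_context, not by the order of these two calls)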
output.append(hparams, sliced.slices, true);
output.append_overview(hparams, sliced.overview, true);
output.grid_x = inst.grid_size.width;
output.grid_y = inst.grid_size.height;
return output;
@@ -1131,9 +1131,10 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const cli
img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NEAREST, hparams.image_pad_color);
mtmd_image_preproc_out output;
output.append(hparams, padded, true);
output.grid_x = 1;
output.grid_y = 1;
output.append_overview(hparams, padded, true);
output.grid_x = 0;
output.grid_y = 0;
// TODO @ngxson : support slicing for DeepSeek-OCR, to do in another PR
return output;
}
@@ -1226,10 +1227,8 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const cl
clip_image_u8 padded;
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NEAREST, hparams.image_pad_color);
output.append(hparams, padded, true);
output.entries.back().add_viewsep = true;
output.grid_x = 1;
output.grid_y = 1;
output.append_overview(hparams, padded, true);
output.overview.add_viewsep = true;
return output;
}
@@ -1447,15 +1446,14 @@ mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_im
const auto instructions = build_slice_instructions(hparams, prepared.get_size());
mtmd_image_preproc_out output;
clip_image_f32 overview_f32;
// overview (normalized f32, already includes mean/std)
img_u8_resize_bilinear_to_f32(
prepared,
overview_f32,
output.overview,
hparams.image_size,
hparams.image_size,
hparams.image_mean,
hparams.image_std);
output.append(hparams, overview_f32, false);
if (instructions.slices.empty()) {
output.grid_x = 0;
@@ -1548,13 +1546,13 @@ mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_im
mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) {
auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img);
if (output.entries.size() == 1) {
if (output.entries.size() == 0) {
// Single-tile (overview only): append one newline row.
output.entries[0].add_newline = true;
output.overview.add_newline = true;
} else {
// Multi-tile: overview gets no newline, grid tiles get one.
output.entries[0].add_newline = false;
for (size_t i = 1; i < output.entries.size(); ++i) {
output.overview.add_newline = false;
for (size_t i = 0; i < output.entries.size(); ++i) {
output.entries[i].add_newline = true;
}
}
+15 -1
View File
@@ -11,11 +11,19 @@
// Output of an image preprocessor (the value returned by the preprocess() implementations).
struct mtmd_image_preproc_out {
    // images added through append(); llava-uhd style preprocessors store their slices here
    std::vector<clip_image_f32> entries;

    clip_image_f32 overview; // overview image (downscaled image)

    // grid size is required for llava-uhd style models
    // (both stay 0 when the preprocessor produced no slice grid)
    int grid_x = 0;
    int grid_y = 0;

    // convert the given image(s), optionally normalize with hparams.image_mean / image_std, and add to `entries`
    void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
    void append(const clip_hparams & hparams, const std::vector<clip_image_u8> & imgs, bool normalized = true);
    void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true);

    // same as append(), but the result is written to `overview` instead of `entries`
    void append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);

    // True only when the overview has a usable size, i.e. BOTH dimensions are non-zero.
    // This is the same validity condition that split_batch_to_chunk() enforces before
    // using the overview; an image with a single non-zero dimension must not be
    // reported as present, otherwise the caller enters the overview path and throws.
    bool has_overview() const {
        return overview.nx() > 0 && overview.ny() > 0;
    }
};
// base class, models must inherit from this class
@@ -46,6 +54,8 @@ struct mtmd_image_preprocessor {
* [overview] --> [slice 1] --> [slice 2]
* | |
* +--> [slice 3] --> [slice 4]
*
* NOTE: for the ordering of overview, set "ov_img_first" on the mtmd_context
*/
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
@@ -67,7 +77,11 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
// LFM2 override this function to implement its custom slicing logic
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
std::vector<clip_image_u8> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
// Result of slice_image(): the overview and the cropped slices are kept apart,
// so the caller decides how each one is added to the preprocessing output.
struct slice_output {
// input image resized to the overview size given by the slice instructions
clip_image_u8 overview;
// cropped slices, in the order they were produced; left empty when the instructions contain no slices
std::vector<clip_image_u8> slices;
};
slice_output slice_image(const clip_image_u8 & img, const slice_instructions & inst);
private:
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
+64 -51
View File
@@ -516,6 +516,7 @@ struct mtmd_context {
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
ov_img_first = false;
} break;
case PROJECTOR_TYPE_STEP3VL:
{
@@ -539,6 +540,7 @@ struct mtmd_context {
img_beg = "<img>";
img_end = "</img>";
image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
ov_img_first = false;
} break;
case PROJECTOR_TYPE_KIMIVL:
{
@@ -615,11 +617,13 @@ struct mtmd_context {
{
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
ov_img_first = false;
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
ov_img_first = false;
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
@@ -640,6 +644,7 @@ struct mtmd_context {
img_beg = "<image>";
img_end = "";
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
ov_img_first = true;
} break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1079,26 +1084,38 @@ struct mtmd_tokenizer {
// for llava-uhd style, we need to handle grid too
// we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd
if (tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) {
if ((tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0)
|| tmp_preproc_out.has_overview()) {
GGML_ASSERT(bitmaps.size() == 1);
preproc_out.grid_x = tmp_preproc_out.grid_x;
preproc_out.grid_y = tmp_preproc_out.grid_y;
preproc_out.overview = std::move(tmp_preproc_out.overview);
}
}
LOG_DBG("%s: preproc_out has %zu entries, grid_x = %d, grid_y = %d, has_overview = %d\n",
__func__, preproc_out.entries.size(), preproc_out.grid_x, preproc_out.grid_y,
preproc_out.has_overview() ? 1 : 0);
// handle llava-uhd style preprocessing
const bool has_tiling_grid = preproc_out.grid_x > 0 && preproc_out.grid_y > 0;
// (output either a grid, or overview-only)
const bool has_tiling_grid = (preproc_out.grid_x > 0 && preproc_out.grid_y > 0)
|| preproc_out.has_overview();
if (has_tiling_grid) {
// [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
GGML_ASSERT(bitmaps.size() == 1);
const int n_col = preproc_out.grid_x;
const int n_row = preproc_out.grid_y;
// split batch into chunks of single images
// NOTE: preproc_out will be invalidated after this call
auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id);
GGML_ASSERT(chunks.size() > 0);
// NOTE: preproc_out is invalidated after this point, do not use it anymore
// split_batch_to_chunk must always put the overview image first
auto ov_chunk = std::move(chunks.front());
chunks.erase(chunks.begin());
@@ -1125,7 +1142,16 @@ struct mtmd_tokenizer {
std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
add_text(std::string(buf.get(), buf.get() + sz - 1), true);
}
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
auto & curr_chunk = chunks[y * n_col + x];
auto & curr_batch = curr_chunk.tokens_image->batch_f32;
if (curr_batch.entries.size() != 1) {
throw std::runtime_error(string_format("%s: expect 1 image in batch_f32", __func__));
}
LOG_DBG("%s: adding slice image at row %d col %d\n", __func__, y, x);
cur.entries.emplace_back(std::move(curr_chunk));
add_text(ctx->tok_sli_img_end);
if (!is_last_in_row) {
add_text(ctx->tok_sli_img_mid);
@@ -1147,6 +1173,11 @@ struct mtmd_tokenizer {
} else {
if (preproc_out.entries.size() == 0) {
LOG_ERR("%s: no image tokens produced by preprocessor (ref: https://github.com/ggml-org/llama.cpp/pull/24769)\n", __func__);
return 2;
}
size_t n_tokens = 0;
for (auto & e : preproc_out.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, &e);
@@ -1303,13 +1334,15 @@ struct mtmd_tokenizer {
std::vector<mtmd_input_chunk> split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
for (auto & entry : preproc_out.entries) {
auto process_chunk = [&](clip_image_f32 && img) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &entry);
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &img);
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->batch_f32.entries.push_back(std::move(img));
image_tokens->id = id;
GGML_ASSERT(image_tokens->nx > 0);
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
@@ -1317,6 +1350,21 @@ struct mtmd_tokenizer {
nullptr, // audio tokens
};
chunks.emplace_back(std::move(chunk));
};
// overview image first
auto & overview = preproc_out.overview;
if (overview.nx() == 0 || overview.ny() == 0) {
throw std::runtime_error(string_format("%s: invalid overview image for llava-uhd style preprocessing\n", __func__));
}
process_chunk(std::move(preproc_out.overview));
// then, process slices
for (auto & entry : preproc_out.entries) {
if (entry.nx() == 0 || entry.ny() == 0) {
throw std::runtime_error(string_format("%s: invalid image slice for llava-uhd style preprocessing\n", __func__));
}
process_chunk(std::move(entry));
}
return chunks;
@@ -1390,57 +1438,22 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
return 1;
}
auto proj_type = clip_get_projector_type(ctx_clip);
int n_embd_out = ctx->n_embd_out();
auto n_tokens_out = image_tokens->n_tokens();
out_embd.resize((size_t)n_embd_out * n_tokens_out);
bool ok = false;
if (clip_is_llava(ctx_clip)
|| proj_type == PROJECTOR_TYPE_MINICPMV
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|| proj_type == PROJECTOR_TYPE_INTERNVL
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
|| proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
// entries may have different token counts
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
size_t offset = 0;
for (size_t i = 0; i < entries.size(); i++) {
if (entries[i].is_placeholder()) {
LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
return 1;
}
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, &entries[i]);
std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
bool ok_i = clip_image_encode(
ctx_clip,
ctx->n_threads,
&entries[i],
tmp_embd);
if (!ok_i) {
LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
return 1;
}
ok = true;
std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
}
} else {
if (image_tokens->is_placeholder()) {
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
return 1;
}
ok = clip_image_batch_encode(
ctx_clip,
ctx->n_threads,
&image_tokens->batch_f32,
out_embd);
if (image_tokens->is_placeholder()) {
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
return 1;
}
bool ok = clip_image_batch_encode(
ctx_clip,
ctx->n_threads,
&image_tokens->batch_f32,
out_embd);
return ok ? 0 : 1;
}