diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index a807fefda3..57afb542d4 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -26,6 +26,13 @@ void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32 entries.push_back(std::move(img)); } +void mtmd_image_preproc_out::append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) { /* fills the dedicated overview slot from img (entries is not modified); mean/std normalization runs only when normalized is true */ + overview.from_u8(img); + if (normalized) { + overview.normalize(hparams.image_mean, hparams.image_std); + } +} + // set of tools to manipulate images // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv struct img_tool { @@ -607,10 +614,11 @@ private: mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) { const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); - std::vector imgs = slice_image(img, inst); + auto sliced = slice_image(img, inst); mtmd_image_preproc_out output; - output.append(hparams, imgs, true); + output.append_overview(hparams, sliced.overview, true); + output.append(hparams, sliced.slices, true); output.grid_x = inst.grid_size.width; output.grid_y = inst.grid_size.height; @@ -722,22 +730,15 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll return res; } -std::vector mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) { - std::vector output; +mtmd_image_preprocessor_llava_uhd::slice_output mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst) { + slice_output output; // resize to overview size - clip_image_u8 resized_img; - img_tool::resize(img, resized_img, inst.overview_size, hparams.image_resize_algo_ov, + img_tool::resize(img, 
output.overview, inst.overview_size, hparams.image_resize_algo_ov, hparams.image_pad_ov, hparams.image_pad_color_ov); - if (overview_first) { - output.push_back(resized_img); - } if (inst.slices.empty()) { - // no slices, just return the resized image - if (!overview_first) { - output.push_back(resized_img); - } + // no slices, just return the overview image return output; } @@ -755,11 +756,7 @@ std::vector mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 img_slice; img_tool::crop(refined_img, img_slice, x, y, w, h); - output.push_back(std::move(img_slice)); - } - - if (!overview_first) { - output.push_back(resized_img); + output.slices.push_back(std::move(img_slice)); } return output; @@ -1077,10 +1074,11 @@ mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_i }); } } - auto imgs = slice_image(img, instructions); + auto sliced = slice_image(img, instructions); mtmd_image_preproc_out output; - output.append(hparams, imgs, true); + output.append_overview(hparams, sliced.overview, true); + output.append(hparams, sliced.slices, true); output.grid_x = instructions.grid_size.width; output.grid_y = instructions.grid_size.height; return output; @@ -1094,10 +1092,12 @@ mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_i GGML_ASSERT(!hparams.image_res_candidates.empty()); const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); - std::vector imgs = slice_image(img, inst, false); + auto sliced = slice_image(img, inst); mtmd_image_preproc_out output; - output.append(hparams, imgs, true); + // InternVL: slices first, then overview + output.append(hparams, sliced.slices, true); + output.append_overview(hparams, sliced.overview, true); output.grid_x = inst.grid_size.width; output.grid_y = inst.grid_size.height; return output; @@ -1131,9 +1131,10 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const cli img_tool::resize(img, 
padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST, hparams.image_pad_color); mtmd_image_preproc_out output; - output.append(hparams, padded, true); - output.grid_x = 1; - output.grid_y = 1; + output.append_overview(hparams, padded, true); + output.grid_x = 0; + output.grid_y = 0; + // TODO @ngxson : support slicing for DeepSeek-OCR, to do in another PR return output; } @@ -1226,10 +1227,8 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const cl clip_image_u8 padded; img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST, hparams.image_pad_color); - output.append(hparams, padded, true); - output.entries.back().add_viewsep = true; - output.grid_x = 1; - output.grid_y = 1; + output.append_overview(hparams, padded, true); + output.overview.add_viewsep = true; return output; } @@ -1447,15 +1446,14 @@ mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_im const auto instructions = build_slice_instructions(hparams, prepared.get_size()); mtmd_image_preproc_out output; - clip_image_f32 overview_f32; + // overview (normalized f32, already includes mean/std) img_u8_resize_bilinear_to_f32( prepared, - overview_f32, + output.overview, hparams.image_size, hparams.image_size, hparams.image_mean, hparams.image_std); - output.append(hparams, overview_f32, false); if (instructions.slices.empty()) { output.grid_x = 0; @@ -1548,13 +1546,13 @@ mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_im mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) { auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img); - if (output.entries.size() == 1) { + if (output.entries.size() == 0) { // Single-tile (overview only): append one newline row. - output.entries[0].add_newline = true; + output.overview.add_newline = true; } else { // Multi-tile: overview gets no newline, grid tiles get one. 
- output.entries[0].add_newline = false; - for (size_t i = 1; i < output.entries.size(); ++i) { + output.overview.add_newline = false; + for (size_t i = 0; i < output.entries.size(); ++i) { output.entries[i].add_newline = true; } } diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 8819a135a1..f458e39e76 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -11,11 +11,19 @@ struct mtmd_image_preproc_out { std::vector entries; // grid size is required for llava-uhd style models + + clip_image_f32 overview; // overview image (downscaled image) int grid_x = 0; int grid_y = 0; + void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true); void append(const clip_hparams & hparams, const std::vector & imgs, bool normalized = true); void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true); + + void append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true); + bool has_overview() const { /* present only when both dimensions are non-zero, the same condition split_batch_to_chunk requires of the overview */ + return overview.nx() > 0 && overview.ny() > 0; + } }; // base class, models must inherit from this class @@ -46,6 +54,8 @@ struct mtmd_image_preprocessor { * [overview] --> [slice 1] --> [slice 2] * | | * +--> [slice 3] --> [slice 4] + * + * NOTE: for the ordering of overview, set "ov_img_first" on the mtmd_context */ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} @@ -67,7 +77,11 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { // LFM2 override this function to implement its custom slicing logic virtual slice_instructions get_slice_instructions(const clip_image_size & original_size); - std::vector slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true); + struct slice_output { + clip_image_u8 overview; + std::vector slices; + }; + slice_output slice_image(const clip_image_u8 & 
img, const slice_instructions & inst); private: clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f9ee021ddb..abba2ebf2c 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -516,6 +516,7 @@ struct mtmd_context { LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n" " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__); image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_STEP3VL: { @@ -539,6 +540,7 @@ struct mtmd_context { img_beg = ""; img_end = ""; image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_KIMIVL: { @@ -615,11 +617,13 @@ struct mtmd_context { { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_HUNYUANVL: { @@ -640,6 +644,7 @@ struct mtmd_context { img_beg = ""; img_end = ""; image_preproc = std::make_unique(ctx_v); + ov_img_first = true; } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); @@ -1079,26 +1084,38 @@ struct mtmd_tokenizer { // for llava-uhd style, we need to handle grid too // we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd - if (tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) { + if ((tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) + || tmp_preproc_out.has_overview()) { GGML_ASSERT(bitmaps.size() == 1); preproc_out.grid_x = tmp_preproc_out.grid_x; preproc_out.grid_y = tmp_preproc_out.grid_y; + preproc_out.overview = 
std::move(tmp_preproc_out.overview); } } + LOG_DBG("%s: preproc_out has %zu entries, grid_x = %d, grid_y = %d, has_overview = %d\n", + __func__, preproc_out.entries.size(), preproc_out.grid_x, preproc_out.grid_y, + preproc_out.has_overview() ? 1 : 0); + // handle llava-uhd style preprocessing - const bool has_tiling_grid = preproc_out.grid_x > 0 && preproc_out.grid_y > 0; + // (output either a grid, or overview-only) + const bool has_tiling_grid = (preproc_out.grid_x > 0 && preproc_out.grid_y > 0) + || preproc_out.has_overview(); + if (has_tiling_grid) { // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now GGML_ASSERT(bitmaps.size() == 1); const int n_col = preproc_out.grid_x; const int n_row = preproc_out.grid_y; + // split batch into chunks of single images - // NOTE: preproc_out will be invalidated after this call auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id); GGML_ASSERT(chunks.size() > 0); + // NOTE: preproc_out is invalidated after this point, do not use it anymore + + // split_batch_to_chunk must always put the overview image first auto ov_chunk = std::move(chunks.front()); chunks.erase(chunks.begin()); @@ -1125,7 +1142,16 @@ struct mtmd_tokenizer { std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1); add_text(std::string(buf.get(), buf.get() + sz - 1), true); } - cur.entries.emplace_back(std::move(chunks[y * n_col + x])); + + auto & curr_chunk = chunks[y * n_col + x]; + auto & curr_batch = curr_chunk.tokens_image->batch_f32; + if (curr_batch.entries.size() != 1) { + throw std::runtime_error(string_format("%s: expect 1 image in batch_f32", __func__)); + } + + LOG_DBG("%s: adding slice image at row %d col %d\n", __func__, y, x); + cur.entries.emplace_back(std::move(curr_chunk)); + add_text(ctx->tok_sli_img_end); if (!is_last_in_row) { add_text(ctx->tok_sli_img_mid); @@ -1147,6 +1173,11 @@ struct mtmd_tokenizer { } else { + if (preproc_out.entries.size() == 0) { + 
LOG_ERR("%s: no image tokens produced by preprocessor (ref: https://github.com/ggml-org/llama.cpp/pull/24769)\n", __func__); + return 2; + } + size_t n_tokens = 0; for (auto & e : preproc_out.entries) { n_tokens += clip_n_output_tokens(ctx->ctx_v, &e); @@ -1303,13 +1334,15 @@ struct mtmd_tokenizer { std::vector split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) { std::vector chunks; - for (auto & entry : preproc_out.entries) { + auto process_chunk = [&](clip_image_f32 && img) { /* wraps one image into its own single-entry image chunk and appends it to chunks */ mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &entry); + image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &img); image_tokens->ny = 1; - image_tokens->batch_f32.entries.push_back(std::move(entry)); + image_tokens->batch_f32.entries.push_back(std::move(img)); image_tokens->id = id; + GGML_ASSERT(image_tokens->nx > 0); + mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, // text tokens @@ -1317,6 +1350,21 @@ struct mtmd_tokenizer { nullptr, // audio tokens }; chunks.emplace_back(std::move(chunk)); + }; + + // overview image first + auto & overview = preproc_out.overview; + if (overview.nx() == 0 || overview.ny() == 0) { + throw std::runtime_error(string_format("%s: invalid overview image for llava-uhd style preprocessing\n", __func__)); + } + process_chunk(std::move(preproc_out.overview)); + + // then, process slices + for (auto & entry : preproc_out.entries) { + if (entry.nx() == 0 || entry.ny() == 0) { + throw std::runtime_error(string_format("%s: invalid image slice for llava-uhd style preprocessing\n", __func__)); + } + process_chunk(std::move(entry)); } return chunks; @@ -1390,57 +1438,22 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); return 1; } - auto proj_type = clip_get_projector_type(ctx_clip); int n_embd_out = ctx->n_embd_out(); 
auto n_tokens_out = image_tokens->n_tokens(); out_embd.resize((size_t)n_embd_out * n_tokens_out); - bool ok = false; - - if (clip_is_llava(ctx_clip) - || proj_type == PROJECTOR_TYPE_MINICPMV - || proj_type == PROJECTOR_TYPE_GLM_EDGE - || proj_type == PROJECTOR_TYPE_INTERNVL - || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2 - || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) { - // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() - const auto & entries = image_tokens->batch_f32.entries; - // entries may have different token counts - // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view - size_t offset = 0; - for (size_t i = 0; i < entries.size(); i++) { - if (entries[i].is_placeholder()) { - LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i); - return 1; - } - int n_tokens_per_image = clip_n_output_tokens(ctx_clip, &entries[i]); - std::vector tmp_embd((size_t)n_tokens_per_image * n_embd_out); - bool ok_i = clip_image_encode( - ctx_clip, - ctx->n_threads, - &entries[i], - tmp_embd); - if (!ok_i) { - LOG_ERR("%s: failed to encode image %zu\n", __func__, i); - return 1; - } - ok = true; - std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset); - offset += static_cast(n_embd_out) * n_tokens_per_image; - } - } else { - if (image_tokens->is_placeholder()) { - LOG_ERR("%s: image tokens batch is placeholder\n", __func__); - return 1; - } - ok = clip_image_batch_encode( - ctx_clip, - ctx->n_threads, - &image_tokens->batch_f32, - out_embd); + if (image_tokens->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; } + bool ok = clip_image_batch_encode( + ctx_clip, + ctx->n_threads, + &image_tokens->batch_f32, + out_embd); + return ok ? 0 : 1; }