diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 48796b6306..08ed0b3412 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -70,6 +70,7 @@ struct clip_hparams { std::vector image_res_candidates; int32_t preproc_min_tiles = 0; int32_t preproc_max_tiles = 0; + int32_t preproc_tile_size = 0; // local tile size (deepseek-ocr) resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC; resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR; pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 208486fd15..f83def0854 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1569,7 +1569,16 @@ struct clip_model_loader { get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + hparams.preproc_min_tiles = 2; + if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) { + hparams.preproc_max_tiles = 9; + hparams.preproc_tile_size = 640; + // the CLIP/ViT body runs its layernorms at 1e-5 (the SAM stage uses 1e-6) + hparams.eps = 1e-5f; + } if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + hparams.preproc_max_tiles = 6; + hparams.preproc_tile_size = 768; // qwen2 encoder is GQA, requires KEY_N_HEAD_KV get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv); } @@ -3182,6 +3191,16 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc return batch->entries[idx].get(); } +std::vector clip_get_newline_embd(const struct clip_ctx * ctx) { + const ggml_tensor * nl = ctx->model.image_newline; + if (nl == nullptr || nl->type != GGML_TYPE_F32) { + return {}; + } + std::vector out(ggml_nelements(nl)); + ggml_backend_tensor_get(nl, out.data(), 0, ggml_nbytes(nl)); + return out; +} + void clip_free(clip_ctx * ctx) { if (ctx == nullptr) { return; @@ -3222,6 +3241,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * return (img->nx() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: return img->nx() / (params.patch_size * params.n_merge); + case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: + return (img->nx() / params.patch_size) / 4; default: break; } @@ -3431,10 +3453,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // E.g., 64x64 -> 16x16 patches n_patches /= 16; - // build_global_local_features adds image newlines and view separator - // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches) - int h = static_cast(std::sqrt(static_cast(n_patches))); - n_patches = h * (h + 1) + 1; + if (img->add_viewsep) { + // global view: one image-newline per token-row + trailing view separator + const int h = static_cast(std::sqrt(static_cast(n_patches))); + n_patches = h * (h + 1) + 1; + } } break; case PROJECTOR_TYPE_HUNYUANVL: { diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 7197af8569..7b1a98a9aa 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -97,6 +97,9 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data +// read the image-newline embedding from the backend; empty if the model has none +std::vector clip_get_newline_embd(const struct clip_ctx * ctx); + bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector & out_vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector & out_batch_embd); diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp index c3c22d0a4b..fbd4e2f730 100644 --- a/tools/mtmd/models/deepseekocr.cpp +++ b/tools/mtmd/models/deepseekocr.cpp @@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { const int n_heads = hparams.sam_n_head; const int d_heads = n_embd / n_heads; const int window = hparams.attn_window_size; + // SAM stage runs its layernorms at 1e-6 + const float sam_eps = 1e-6f; ggml_tensor * inpL; @@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { ggml_tensor * shortcut = cur; // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il); const int64_t w0 = cur->ne[1]; const int64_t h0 = cur->ne[2]; @@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { ggml_tensor * inpFF = cur; // layernorm2 - cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il); // ffn cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, @@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1); cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); @@ -303,16 +305,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() { cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); cur = ggml_add(ctx0, cur, model.mm_fc_b); - const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); - const auto w = h; - const auto n_dim = cur->ne[0]; + // global view: weave one newline per row + trailing view separator + if (img.add_viewsep) { + const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); + const auto w = h; + const auto n_dim = cur->ne[0]; - ggml_tensor * imgnl; - - imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); - cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); - cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1) + ggml_tensor * imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); + cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); + cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); + cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1) + } cb(cur, "dsocr_output", -1); diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index bedf44e07c..dd83515e42 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -1112,46 +1112,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli // mtmd_image_preprocessor_deepseekocr // -bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; - // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - - const int64_t orig_area = static_cast(img.get_size().area()); - - size_t mode_i = 0; - int64_t min_diff = std::numeric_limits::max(); - for (size_t i = 0; i < std::size(native_resolutions); i++) { - const int64_t r = native_resolutions[i]; - const int64_t diff = std::abs(orig_area - r * r); - if (diff < min_diff) { - mode_i = i; - min_diff = diff; - } - } - const int image_size = native_resolutions[mode_i]; - - // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for - // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor. - clip_image_u8 padded; - img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW, - PAD_NEAREST, hparams.image_pad_color); - - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); - - output.grid_x = 1; - output.grid_y = 1; - return true; -} - -// -// mtmd_image_preprocessor_deepseekocr2 -// - -// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles -// sorted by tile count -std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ratios() { +std::vector mtmd_image_preprocessor_deepseekocr::get_target_ratios() const { std::vector ratios; for (int n = min_tiles; n <= max_tiles; n++) { for (int w = 1; w <= n; w++) { @@ -1178,13 +1139,11 @@ std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ra return ratios; } -// pick the grid whose aspect ratio is closest to the image -// on a tie, prefer the larger grid when the image fits -clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( +clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio( float aspect_ratio, const std::vector & target_ratios, int width, - int height) { + int height) const { float best_ratio_diff = std::numeric_limits::max(); clip_image_size best_ratio = { 1, 1 }; const float area = static_cast(width * height); @@ -1205,23 +1164,26 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( return best_ratio; } -bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - // emit 768x768 local tiles when the image is larger than a tile in either - // dimension, then always a 1024x1024 global view. order: [tiles..., global]. +bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + // output order: [local tiles..., global] + int grid_w = 1; + int grid_h = 1; const auto img_size = img.get_size(); if (img_size.width > tile_size || img_size.height > tile_size) { const float aspect_ratio = static_cast(img_size.width) / img_size.height; const auto target_ratios = get_target_ratios(); - const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height); + const clip_image_size grid = + find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height); + grid_w = grid.width; + grid_h = grid.height; - // stretch onto the grid (no aspect preserve), then crop tiles row-major. clip_image_u8 refined; - img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height }, - RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE); + img_tool::resize(img, refined, { tile_size * grid_w, tile_size * grid_h }, RESIZE_ALGO_BICUBIC_PILLOW, + PAD_NONE); - for (int row = 0; row < grid.height; row++) { - for (int col = 0; col < grid.width; col++) { + for (int row = 0; row < grid_h; row++) { + for (int col = 0; col < grid_w; col++) { clip_image_u8 tile; img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size); clip_image_f32_ptr res(clip_image_f32_init()); @@ -1231,17 +1193,17 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, } } - // global view: aspect-preserving fit-and-pad to base_size. + // global view: aspect-preserving fit-and-pad to base_size clip_image_u8 padded; - img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, - PAD_NEAREST, hparams.image_pad_color); + img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST, + hparams.image_pad_color); clip_image_f32_ptr global(clip_image_f32_init()); img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std); global->add_viewsep = true; output.entries.push_back(std::move(global)); - - output.grid_x = 1; - output.grid_y = 1; + output.grid_x = grid_w; + output.grid_y = grid_h; + LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h); return true; } diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 91a5bc253e..5ff4a3789f 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -139,29 +139,27 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd { bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; +// DeepSeek-OCR (v1/v2) global view + optional local tile grid struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { - mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; -}; - -// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local -// tiles when the image is larger than a tile in either dimension. -struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor { - static constexpr int base_size = 1024; // global view - static constexpr int tile_size = 768; // local tile - static constexpr int min_tiles = 2; - static constexpr int max_tiles = 6; - - mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) + : mtmd_image_preprocessor(ctx), + base_size(hparams.image_size), + tile_size(hparams.preproc_tile_size), + min_tiles(hparams.preproc_min_tiles), + max_tiles(hparams.preproc_max_tiles) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; private: - static std::vector get_target_ratios(); - static clip_image_size find_closest_aspect_ratio( - float aspect_ratio, - const std::vector & target_ratios, - int width, - int height); + int base_size; // global view + int tile_size; // each tile + int min_tiles; + int max_tiles; + + std::vector get_target_ratios() const; + clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) const; }; // custom image preprocessing for Step3VL diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index ad709227f7..9034b3496f 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -612,14 +612,10 @@ struct mtmd_context { image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: - { - img_end = "\n"; // prevent empty batch on llama-server - image_preproc = std::make_unique(ctx_v); - } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: { img_end = "\n"; // prevent empty batch on llama-server - image_preproc = std::make_unique(ctx_v); + image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_HUNYUANVL: { @@ -1169,11 +1165,18 @@ struct mtmd_tokenizer { } else { size_t n_tokens = 0; - for (const auto & e : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); - if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) { - // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image - break; + if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR && batch_f32.entries.size() > 1) { + // v1 weaves the local tiles into a grid (one image-newline per token-row), then the global view + const int h = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); + n_tokens = (h * batch_f32.grid_x + 1) * (h * batch_f32.grid_y); + n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get()); + } else { + for (const auto & e : batch_f32.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); + if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) { + // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image + break; + } } } @@ -1399,6 +1402,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } } +// Stitch the tiles in raw, one newline per token-row, append the overview (raw's last chunk). +// Example, 2x2 grid of tiles A B / C D: +// raw = [ A B C D ] +// out = A.row0 B.row0 n, A.row1 B.row1 n, ..., C.row0 D.row0 n, ..., +static void stitch_tile_grid(clip_ctx * ctx, const clip_image_f32_batch & batch, + const std::vector & raw, int n_embd, float * out) { + const auto & entries = batch.entries; + const int n_tiles = static_cast(entries.size()) - 1; // overview is last + GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y); + const int tile_h = clip_n_output_tokens_x(ctx, entries[0].get()); + const size_t row_sz = static_cast(tile_h) * n_embd; + const size_t tile_sz = static_cast(tile_h) * row_sz; + const std::vector newline = clip_get_newline_embd(ctx); + GGML_ASSERT(!newline.empty()); + + for (int r = 0; r < batch.grid_y; r++) { + for (int pr = 0; pr < tile_h; pr++) { + for (int c = 0; c < batch.grid_x; c++) { + const float * tile = raw.data() + static_cast(r * batch.grid_x + c) * tile_sz; + memcpy(out, tile + static_cast(pr) * row_sz, row_sz * sizeof(float)); + out += row_sz; + } + memcpy(out, newline.data(), static_cast(n_embd) * sizeof(float)); + out += n_embd; + } + } + // overview = raw's last encoded chunk; size it from the entry, not raw.size() (raw is over-allocated) + const size_t global_off = static_cast(n_tiles) * tile_sz; + const size_t global_sz = static_cast(clip_n_output_tokens(ctx, entries.back().get())) * n_embd; + memcpy(out, raw.data() + global_off, global_sz * sizeof(float)); +} + static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector & out_embd) { clip_ctx * ctx_clip = ctx->ctx_v; if (!ctx_clip) { @@ -1417,12 +1452,17 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im || proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE || proj_type == PROJECTOR_TYPE_INTERNVL + || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2 || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; // entries may have different token counts // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view + // DeepSeek-OCR v1, when multi-view, weaves its tiles into a grid (see stitch_tile_grid) + const bool is_dsocr_mlt = proj_type == PROJECTOR_TYPE_DEEPSEEKOCR && entries.size() > 1; + std::vector raw(is_dsocr_mlt ? static_cast(n_embd_out) * n_tokens_out : 0); + float * dst = is_dsocr_mlt ? raw.data() : out_embd.data(); size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { if (entries[i]->is_placeholder()) { @@ -1441,9 +1481,12 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im return 1; } ok = true; - std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset); + std::copy(tmp_embd.begin(), tmp_embd.end(), dst + offset); offset += static_cast(n_embd_out) * n_tokens_per_image; } + if (is_dsocr_mlt) { + stitch_tile_grid(ctx_clip, image_tokens->batch_f32, raw, n_embd_out, out_embd.data()); + } } else { if (image_tokens->is_placeholder()) { LOG_ERR("%s: image tokens batch is placeholder\n", __func__); diff --git a/tools/mtmd/tests/test-1-positive.png b/tools/mtmd/tests/test-1-positive.png new file mode 100644 index 0000000000..007614594e Binary files /dev/null and b/tools/mtmd/tests/test-1-positive.png differ diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 5f5fef765a..f641045355 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -82,6 +82,24 @@ CASES = [ # is one pixel off and lands at ~0.69 instead. hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, ), + TestCase( + model_key="v1", label="multi-tile (dynamic resolution)", + image="tools/mtmd/tests/test-1-positive.png", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid -> + # 2 local 640 tiles + 1 global 1024 view. Regression guard for the + # tiling preprocessor -- a broken tile path craters the score. + # hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly. + hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0, + ), + TestCase( + model_key="v2", label="multi-tile (dynamic resolution)", + image="tools/mtmd/tests/test-1-positive.png", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid -> + # 2 local 768 tiles + 1 global 1024 view = 545 image tokens. + hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0, + ), ]