mtmd: deepseek-ocr v1 multi-tile dynamic resolution + unified image-preprocessors for both versions (ds-ocr v1 and v2)

This commit is contained in:
Saba Fallah
2026-06-15 11:57:03 +02:00
committed by Xuan Son Nguyen
parent 74ade52741
commit d4bbef8083
9 changed files with 157 additions and 106 deletions
+1
View File
@@ -70,6 +70,7 @@ struct clip_hparams {
std::vector<clip_image_size> image_res_candidates;
int32_t preproc_min_tiles = 0;
int32_t preproc_max_tiles = 0;
int32_t preproc_tile_size = 0; // local tile size (deepseek-ocr)
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6)
+27 -4
View File
@@ -1569,7 +1569,16 @@ struct clip_model_loader {
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
hparams.preproc_min_tiles = 2;
if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) {
hparams.preproc_max_tiles = 9;
hparams.preproc_tile_size = 640;
// the CLIP/ViT body runs its layernorms at 1e-5 (the SAM stage uses 1e-6)
hparams.eps = 1e-5f;
}
if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
hparams.preproc_max_tiles = 6;
hparams.preproc_tile_size = 768;
// qwen2 encoder is GQA, requires KEY_N_HEAD_KV
get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
}
@@ -3182,6 +3191,16 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
return batch->entries[idx].get();
}
std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx) {
const ggml_tensor * nl = ctx->model.image_newline;
if (nl == nullptr || nl->type != GGML_TYPE_F32) {
return {};
}
std::vector<float> out(ggml_nelements(nl));
ggml_backend_tensor_get(nl, out.data(), 0, ggml_nbytes(nl));
return out;
}
void clip_free(clip_ctx * ctx) {
if (ctx == nullptr) {
return;
@@ -3222,6 +3241,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
return (img->nx() / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
return img->nx() / (params.patch_size * params.n_merge);
case PROJECTOR_TYPE_DEEPSEEKOCR:
case PROJECTOR_TYPE_DEEPSEEKOCR2:
return (img->nx() / params.patch_size) / 4;
default:
break;
}
@@ -3431,10 +3453,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
// E.g., 64x64 -> 16x16 patches
n_patches /= 16;
// build_global_local_features adds image newlines and view separator
// Formula: h*(w+1) + 1 where h = w = sqrt(n_patches)
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
n_patches = h * (h + 1) + 1;
if (img->add_viewsep) {
// global view: one image-newline per token-row + trailing view separator
const int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
n_patches = h * (h + 1) + 1;
}
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
+3
View File
@@ -97,6 +97,9 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
// read the image-newline embedding from the backend; empty if the model has none
std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx);
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);
+16 -13
View File
@@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
const int n_heads = hparams.sam_n_head;
const int d_heads = n_embd / n_heads;
const int window = hparams.attn_window_size;
// SAM stage runs its layernorms at 1e-6
const float sam_eps = 1e-6f;
ggml_tensor * inpL;
@@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
ggml_tensor * shortcut = cur;
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);
const int64_t w0 = cur->ne[1];
const int64_t h0 = cur->ne[2];
@@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
ggml_tensor * inpFF = cur;
// layernorm2
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);
// ffn
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
@@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
@@ -303,16 +305,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);
const auto h = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
const auto w = h;
const auto n_dim = cur->ne[0];
// global view: weave one newline per row + trailing view separator
if (img.add_viewsep) {
const auto h = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
const auto w = h;
const auto n_dim = cur->ne[0];
ggml_tensor * imgnl;
imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
ggml_tensor * imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
}
cb(cur, "dsocr_output", -1);
+21 -59
View File
@@ -1112,46 +1112,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
// mtmd_image_preprocessor_deepseekocr
//
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
size_t mode_i = 0;
int64_t min_diff = std::numeric_limits<int64_t>::max();
for (size_t i = 0; i < std::size(native_resolutions); i++) {
const int64_t r = native_resolutions[i];
const int64_t diff = std::abs(orig_area - r * r);
if (diff < min_diff) {
mode_i = i;
min_diff = diff;
}
}
const int image_size = native_resolutions[mode_i];
// Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
// byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
clip_image_u8 padded;
img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NEAREST, hparams.image_pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
output.entries.push_back(std::move(res));
output.grid_x = 1;
output.grid_y = 1;
return true;
}
//
// mtmd_image_preprocessor_deepseekocr2
//
// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
// sorted by tile count
std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr::get_target_ratios() const {
std::vector<clip_image_size> ratios;
for (int n = min_tiles; n <= max_tiles; n++) {
for (int w = 1; w <= n; w++) {
@@ -1178,13 +1139,11 @@ std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ra
return ratios;
}
// pick the grid whose aspect ratio is closest to the image
// on a tie, prefer the larger grid when the image fits
clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio(
float aspect_ratio,
const std::vector<clip_image_size> & target_ratios,
int width,
int height) {
int height) const {
float best_ratio_diff = std::numeric_limits<float>::max();
clip_image_size best_ratio = { 1, 1 };
const float area = static_cast<float>(width * height);
@@ -1205,23 +1164,26 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
return best_ratio;
}
bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
// emit 768x768 local tiles when the image is larger than a tile in either
// dimension, then always a 1024x1024 global view. order: [tiles..., global].
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
// output order: [local tiles..., global]
int grid_w = 1;
int grid_h = 1;
const auto img_size = img.get_size();
if (img_size.width > tile_size || img_size.height > tile_size) {
const float aspect_ratio = static_cast<float>(img_size.width) / img_size.height;
const auto target_ratios = get_target_ratios();
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
const clip_image_size grid =
find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
grid_w = grid.width;
grid_h = grid.height;
// stretch onto the grid (no aspect preserve), then crop tiles row-major.
clip_image_u8 refined;
img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
img_tool::resize(img, refined, { tile_size * grid_w, tile_size * grid_h }, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NONE);
for (int row = 0; row < grid.height; row++) {
for (int col = 0; col < grid.width; col++) {
for (int row = 0; row < grid_h; row++) {
for (int col = 0; col < grid_w; col++) {
clip_image_u8 tile;
img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
clip_image_f32_ptr res(clip_image_f32_init());
@@ -1231,17 +1193,17 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
}
}
// global view: aspect-preserving fit-and-pad to base_size.
// global view: aspect-preserving fit-and-pad to base_size
clip_image_u8 padded;
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NEAREST, hparams.image_pad_color);
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST,
hparams.image_pad_color);
clip_image_f32_ptr global(clip_image_f32_init());
img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
global->add_viewsep = true;
output.entries.push_back(std::move(global));
output.grid_x = 1;
output.grid_y = 1;
output.grid_x = grid_w;
output.grid_y = grid_h;
LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h);
return true;
}
+17 -19
View File
@@ -139,29 +139,27 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
// DeepSeek-OCR (v1/v2) global view + optional local tile grid
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
// tiles when the image is larger than a tile in either dimension.
struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
static constexpr int base_size = 1024; // global view
static constexpr int tile_size = 768; // local tile
static constexpr int min_tiles = 2;
static constexpr int max_tiles = 6;
mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx)
: mtmd_image_preprocessor(ctx),
base_size(hparams.image_size),
tile_size(hparams.preproc_tile_size),
min_tiles(hparams.preproc_min_tiles),
max_tiles(hparams.preproc_max_tiles) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
private:
static std::vector<clip_image_size> get_target_ratios();
static clip_image_size find_closest_aspect_ratio(
float aspect_ratio,
const std::vector<clip_image_size> & target_ratios,
int width,
int height);
int base_size; // global view
int tile_size; // each tile
int min_tiles;
int max_tiles;
std::vector<clip_image_size> get_target_ratios() const;
clip_image_size find_closest_aspect_ratio(
float aspect_ratio,
const std::vector<clip_image_size> & target_ratios,
int width, int height) const;
};
// custom image preprocessing for Step3VL
+54 -11
View File
@@ -612,14 +612,10 @@ struct mtmd_context {
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
{
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
@@ -1169,11 +1165,18 @@ struct mtmd_tokenizer {
} else {
size_t n_tokens = 0;
for (const auto & e : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
break;
if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR && batch_f32.entries.size() > 1) {
// v1 weaves the local tiles into a grid (one image-newline per token-row), then the global view
const int h = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
n_tokens = (h * batch_f32.grid_x + 1) * (h * batch_f32.grid_y);
n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get());
} else {
for (const auto & e : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
break;
}
}
}
@@ -1399,6 +1402,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
}
}
// Stitch the encoded tiles in `raw` into one grid, emitting one newline embedding after every
// token-row of the grid, then append the overview (raw's last chunk).
// Example, 2x2 grid of tiles A B / C D:
//   raw = [ A B C D <overview> ]
//   out = A.row0 B.row0 n, A.row1 B.row1 n, ..., C.row0 D.row0 n, ..., <overview>
//
// ctx    : clip context; used to query per-entry token counts and the newline embedding
// batch  : preprocessed entries, tiles first (row-major, grid_x * grid_y of them), overview last
// raw    : per-entry embeddings laid out back to back in entry order; may be larger than needed
// n_embd : number of floats per token
// out    : destination; the caller must provide room for
//          ((tile_h * grid_x + 1) * (tile_h * grid_y) + overview tokens) * n_embd floats,
//          where tile_h = clip_n_output_tokens_x() of the first entry
//
// Each tile is read as tile_h x tile_h tokens. Violated preconditions abort via GGML_ASSERT
// before any out-of-bounds read can happen.
static void stitch_tile_grid(clip_ctx * ctx, const clip_image_f32_batch & batch,
                             const std::vector<float> & raw, int n_embd, float * out) {
    const auto & entries = batch.entries;
    GGML_ASSERT(n_embd > 0);
    GGML_ASSERT(!entries.empty()); // entries[0] and entries.back() are dereferenced below

    const int n_tiles = static_cast<int>(entries.size()) - 1; // overview is last
    GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y);

    const int tile_h = clip_n_output_tokens_x(ctx, entries[0].get());
    GGML_ASSERT(tile_h >= 0); // a negative count would wrap around in the size_t math below

    const size_t embd_sz = static_cast<size_t>(n_embd);       // floats per token
    const size_t row_sz  = static_cast<size_t>(tile_h) * embd_sz; // floats per tile token-row
    const size_t tile_sz = static_cast<size_t>(tile_h) * row_sz;  // floats per tile

    // overview = raw's last encoded chunk; size it from the entry, not raw.size() (raw is over-allocated)
    const size_t global_off = static_cast<size_t>(n_tiles) * tile_sz;
    const size_t global_sz  = static_cast<size_t>(clip_n_output_tokens(ctx, entries.back().get())) * embd_sz;
    // every read from raw below stays inside [0, global_off + global_sz)
    GGML_ASSERT(raw.size() >= global_off + global_sz);

    const std::vector<float> newline = clip_get_newline_embd(ctx);
    // n_embd floats are copied out of the newline embedding for every token-row
    GGML_ASSERT(newline.size() >= embd_sz);

    const size_t row_bytes  = row_sz * sizeof(float);
    const size_t embd_bytes = embd_sz * sizeof(float);

    for (int r = 0; r < batch.grid_y; r++) {
        for (int pr = 0; pr < tile_h; pr++) {
            // token-row `pr` of every tile in grid row `r`, left to right
            for (int c = 0; c < batch.grid_x; c++) {
                const float * tile = raw.data() + static_cast<size_t>(r * batch.grid_x + c) * tile_sz;
                memcpy(out, tile + static_cast<size_t>(pr) * row_sz, row_bytes);
                out += row_sz;
            }
            // terminate the stitched token-row with a single newline embedding
            memcpy(out, newline.data(), embd_bytes);
            out += n_embd;
        }
    }

    memcpy(out, raw.data() + global_off, global_sz * sizeof(float));
}
static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
clip_ctx * ctx_clip = ctx->ctx_v;
if (!ctx_clip) {
@@ -1417,12 +1452,17 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
|| proj_type == PROJECTOR_TYPE_MINICPMV
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|| proj_type == PROJECTOR_TYPE_INTERNVL
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
|| proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
// entries may have different token counts
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
// DeepSeek-OCR v1, when multi-view, weaves its tiles into a grid (see stitch_tile_grid)
const bool is_dsocr_mlt = proj_type == PROJECTOR_TYPE_DEEPSEEKOCR && entries.size() > 1;
std::vector<float> raw(is_dsocr_mlt ? static_cast<size_t>(n_embd_out) * n_tokens_out : 0);
float * dst = is_dsocr_mlt ? raw.data() : out_embd.data();
size_t offset = 0;
for (size_t i = 0; i < entries.size(); i++) {
if (entries[i]->is_placeholder()) {
@@ -1441,9 +1481,12 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
return 1;
}
ok = true;
std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
std::copy(tmp_embd.begin(), tmp_embd.end(), dst + offset);
offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
}
if (is_dsocr_mlt) {
stitch_tile_grid(ctx_clip, image_tokens->batch_f32, raw, n_embd_out, out_embd.data());
}
} else {
if (image_tokens->is_placeholder()) {
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
Binary file not shown.

After

Width:  |  Height:  |  Size: 225 KiB

+18
View File
@@ -82,6 +82,24 @@ CASES = [
# is one pixel off and lands at ~0.69 instead.
hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
),
TestCase(
model_key="v1", label="multi-tile (dynamic resolution)",
image="tools/mtmd/tests/test-1-positive.png",
ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
# 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid ->
# 2 local 640 tiles + 1 global 1024 view. Regression guard for the
# tiling preprocessor -- a broken tile path craters the score.
# hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly.
hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0,
),
TestCase(
model_key="v2", label="multi-tile (dynamic resolution)",
image="tools/mtmd/tests/test-1-positive.png",
ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
# 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid ->
# 2 local 768 tiles + 1 global 1024 view = 545 image tokens.
hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0,
),
]