mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-05 02:30:22 +00:00
mtmd: deepseek-ocr v1 multi-tile dynamic resolution + unified image-preprocessors for both versions (ds-ocr v1 and v2)
This commit is contained in:
committed by
Xuan Son Nguyen
parent
74ade52741
commit
d4bbef8083
@@ -70,6 +70,7 @@ struct clip_hparams {
|
||||
std::vector<clip_image_size> image_res_candidates;
|
||||
int32_t preproc_min_tiles = 0;
|
||||
int32_t preproc_max_tiles = 0;
|
||||
int32_t preproc_tile_size = 0; // local tile size (deepseek-ocr)
|
||||
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||
pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6)
|
||||
|
||||
+27
-4
@@ -1569,7 +1569,16 @@ struct clip_model_loader {
|
||||
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
|
||||
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
hparams.preproc_min_tiles = 2;
|
||||
if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR) {
|
||||
hparams.preproc_max_tiles = 9;
|
||||
hparams.preproc_tile_size = 640;
|
||||
// the CLIP/ViT body runs its layernorms at 1e-5 (the SAM stage uses 1e-6)
|
||||
hparams.eps = 1e-5f;
|
||||
}
|
||||
if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
|
||||
hparams.preproc_max_tiles = 6;
|
||||
hparams.preproc_tile_size = 768;
|
||||
// qwen2 encoder is GQA, requires KEY_N_HEAD_KV
|
||||
get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
|
||||
}
|
||||
@@ -3182,6 +3191,16 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
|
||||
return batch->entries[idx].get();
|
||||
}
|
||||
|
||||
std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx) {
|
||||
const ggml_tensor * nl = ctx->model.image_newline;
|
||||
if (nl == nullptr || nl->type != GGML_TYPE_F32) {
|
||||
return {};
|
||||
}
|
||||
std::vector<float> out(ggml_nelements(nl));
|
||||
ggml_backend_tensor_get(nl, out.data(), 0, ggml_nbytes(nl));
|
||||
return out;
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
@@ -3222,6 +3241,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
return (img->nx() / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
return img->nx() / (params.patch_size * params.n_merge);
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
return (img->nx() / params.patch_size) / 4;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3431,10 +3453,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
// E.g., 64x64 -> 16x16 patches
|
||||
n_patches /= 16;
|
||||
|
||||
// build_global_local_features adds image newlines and view separator
|
||||
// Formula: h*(w+1) + 1 where h = w = sqrt(n_patches)
|
||||
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
|
||||
n_patches = h * (h + 1) + 1;
|
||||
if (img->add_viewsep) {
|
||||
// global view: one image-newline per token-row + trailing view separator
|
||||
const int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
|
||||
n_patches = h * (h + 1) + 1;
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
|
||||
@@ -97,6 +97,9 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
|
||||
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
||||
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
|
||||
// read the image-newline embedding from the backend; empty if the model has none
|
||||
std::vector<float> clip_get_newline_embd(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);
|
||||
|
||||
|
||||
@@ -96,6 +96,8 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
|
||||
const int n_heads = hparams.sam_n_head;
|
||||
const int d_heads = n_embd / n_heads;
|
||||
const int window = hparams.attn_window_size;
|
||||
// SAM stage runs its layernorms at 1e-6
|
||||
const float sam_eps = 1e-6f;
|
||||
|
||||
ggml_tensor * inpL;
|
||||
|
||||
@@ -134,7 +136,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
|
||||
ggml_tensor * shortcut = cur;
|
||||
|
||||
// layernorm1
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, sam_eps, il);
|
||||
|
||||
const int64_t w0 = cur->ne[1];
|
||||
const int64_t h0 = cur->ne[2];
|
||||
@@ -214,7 +216,7 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
|
||||
ggml_tensor * inpFF = cur;
|
||||
|
||||
// layernorm2
|
||||
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, sam_eps, il);
|
||||
|
||||
// ffn
|
||||
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
|
||||
@@ -229,12 +231,12 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
|
||||
|
||||
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
||||
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, sam_eps, -1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
|
||||
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
||||
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, sam_eps, -1);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
|
||||
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
|
||||
@@ -303,16 +305,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
|
||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||
|
||||
const auto h = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
|
||||
const auto w = h;
|
||||
const auto n_dim = cur->ne[0];
|
||||
// global view: weave one newline per row + trailing view separator
|
||||
if (img.add_viewsep) {
|
||||
const auto h = static_cast<int>(std::sqrt(static_cast<float>(cur->ne[1])));
|
||||
const auto w = h;
|
||||
const auto n_dim = cur->ne[0];
|
||||
|
||||
ggml_tensor * imgnl;
|
||||
|
||||
imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
|
||||
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
|
||||
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
|
||||
ggml_tensor * imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
|
||||
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
|
||||
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
|
||||
}
|
||||
|
||||
cb(cur, "dsocr_output", -1);
|
||||
|
||||
|
||||
+21
-59
@@ -1112,46 +1112,7 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
|
||||
// mtmd_image_preprocessor_deepseekocr
|
||||
//
|
||||
|
||||
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
|
||||
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
|
||||
|
||||
const int64_t orig_area = static_cast<int64_t>(img.get_size().area());
|
||||
|
||||
size_t mode_i = 0;
|
||||
int64_t min_diff = std::numeric_limits<int64_t>::max();
|
||||
for (size_t i = 0; i < std::size(native_resolutions); i++) {
|
||||
const int64_t r = native_resolutions[i];
|
||||
const int64_t diff = std::abs(orig_area - r * r);
|
||||
if (diff < min_diff) {
|
||||
mode_i = i;
|
||||
min_diff = diff;
|
||||
}
|
||||
}
|
||||
const int image_size = native_resolutions[mode_i];
|
||||
|
||||
// Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
|
||||
// byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
|
||||
clip_image_u8 padded;
|
||||
img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
PAD_NEAREST, hparams.image_pad_color);
|
||||
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
|
||||
output.entries.push_back(std::move(res));
|
||||
|
||||
output.grid_x = 1;
|
||||
output.grid_y = 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_image_preprocessor_deepseekocr2
|
||||
//
|
||||
|
||||
// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
|
||||
// sorted by tile count
|
||||
std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
|
||||
std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr::get_target_ratios() const {
|
||||
std::vector<clip_image_size> ratios;
|
||||
for (int n = min_tiles; n <= max_tiles; n++) {
|
||||
for (int w = 1; w <= n; w++) {
|
||||
@@ -1178,13 +1139,11 @@ std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ra
|
||||
return ratios;
|
||||
}
|
||||
|
||||
// pick the grid whose aspect ratio is closest to the image
|
||||
// on a tie, prefer the larger grid when the image fits
|
||||
clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
|
||||
clip_image_size mtmd_image_preprocessor_deepseekocr::find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width,
|
||||
int height) {
|
||||
int height) const {
|
||||
float best_ratio_diff = std::numeric_limits<float>::max();
|
||||
clip_image_size best_ratio = { 1, 1 };
|
||||
const float area = static_cast<float>(width * height);
|
||||
@@ -1205,23 +1164,26 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
|
||||
return best_ratio;
|
||||
}
|
||||
|
||||
bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
// emit 768x768 local tiles when the image is larger than a tile in either
|
||||
// dimension, then always a 1024x1024 global view. order: [tiles..., global].
|
||||
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
// output order: [local tiles..., global]
|
||||
|
||||
int grid_w = 1;
|
||||
int grid_h = 1;
|
||||
const auto img_size = img.get_size();
|
||||
if (img_size.width > tile_size || img_size.height > tile_size) {
|
||||
const float aspect_ratio = static_cast<float>(img_size.width) / img_size.height;
|
||||
const auto target_ratios = get_target_ratios();
|
||||
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
|
||||
const clip_image_size grid =
|
||||
find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
|
||||
grid_w = grid.width;
|
||||
grid_h = grid.height;
|
||||
|
||||
// stretch onto the grid (no aspect preserve), then crop tiles row-major.
|
||||
clip_image_u8 refined;
|
||||
img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
|
||||
RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
|
||||
img_tool::resize(img, refined, { tile_size * grid_w, tile_size * grid_h }, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
PAD_NONE);
|
||||
|
||||
for (int row = 0; row < grid.height; row++) {
|
||||
for (int col = 0; col < grid.width; col++) {
|
||||
for (int row = 0; row < grid_h; row++) {
|
||||
for (int col = 0; col < grid_w; col++) {
|
||||
clip_image_u8 tile;
|
||||
img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
@@ -1231,17 +1193,17 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
|
||||
}
|
||||
}
|
||||
|
||||
// global view: aspect-preserving fit-and-pad to base_size.
|
||||
// global view: aspect-preserving fit-and-pad to base_size
|
||||
clip_image_u8 padded;
|
||||
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
PAD_NEAREST, hparams.image_pad_color);
|
||||
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST,
|
||||
hparams.image_pad_color);
|
||||
clip_image_f32_ptr global(clip_image_f32_init());
|
||||
img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
|
||||
global->add_viewsep = true;
|
||||
output.entries.push_back(std::move(global));
|
||||
|
||||
output.grid_x = 1;
|
||||
output.grid_y = 1;
|
||||
output.grid_x = grid_w;
|
||||
output.grid_y = grid_h;
|
||||
LOG_DBG("%s: grid size: %d x %d (%d tiles) + global view\n", __func__, grid_w, grid_h, grid_w * grid_h);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
+17
-19
@@ -139,29 +139,27 @@ struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// DeepSeek-OCR (v1/v2) global view + optional local tile grid
|
||||
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
|
||||
// tiles when the image is larger than a tile in either dimension.
|
||||
struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
|
||||
static constexpr int base_size = 1024; // global view
|
||||
static constexpr int tile_size = 768; // local tile
|
||||
static constexpr int min_tiles = 2;
|
||||
static constexpr int max_tiles = 6;
|
||||
|
||||
mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx)
|
||||
: mtmd_image_preprocessor(ctx),
|
||||
base_size(hparams.image_size),
|
||||
tile_size(hparams.preproc_tile_size),
|
||||
min_tiles(hparams.preproc_min_tiles),
|
||||
max_tiles(hparams.preproc_max_tiles) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
|
||||
private:
|
||||
static std::vector<clip_image_size> get_target_ratios();
|
||||
static clip_image_size find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width,
|
||||
int height);
|
||||
int base_size; // global view
|
||||
int tile_size; // each tile
|
||||
int min_tiles;
|
||||
int max_tiles;
|
||||
|
||||
std::vector<clip_image_size> get_target_ratios() const;
|
||||
clip_image_size find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width, int height) const;
|
||||
};
|
||||
|
||||
// custom image preprocessing for Step3VL
|
||||
|
||||
+54
-11
@@ -612,14 +612,10 @@ struct mtmd_context {
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
@@ -1169,11 +1165,18 @@ struct mtmd_tokenizer {
|
||||
} else {
|
||||
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & e : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
|
||||
if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
|
||||
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
|
||||
break;
|
||||
if (ctx->proj_type_v() == PROJECTOR_TYPE_DEEPSEEKOCR && batch_f32.entries.size() > 1) {
|
||||
// v1 weaves the local tiles into a grid (one image-newline per token-row), then the global view
|
||||
const int h = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
n_tokens = (h * batch_f32.grid_x + 1) * (h * batch_f32.grid_y);
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, batch_f32.entries.back().get());
|
||||
} else {
|
||||
for (const auto & e : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
|
||||
if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
|
||||
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1399,6 +1402,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
}
|
||||
}
|
||||
|
||||
// Stitch the tiles in raw, one newline per token-row, append the overview (raw's last chunk).
|
||||
// Example, 2x2 grid of tiles A B / C D:
|
||||
// raw = [ A B C D <overview> ]
|
||||
// out = A.row0 B.row0 n, A.row1 B.row1 n, ..., C.row0 D.row0 n, ..., <overview>
|
||||
static void stitch_tile_grid(clip_ctx * ctx, const clip_image_f32_batch & batch,
|
||||
const std::vector<float> & raw, int n_embd, float * out) {
|
||||
const auto & entries = batch.entries;
|
||||
const int n_tiles = static_cast<int>(entries.size()) - 1; // overview is last
|
||||
GGML_ASSERT(n_tiles == batch.grid_x * batch.grid_y);
|
||||
const int tile_h = clip_n_output_tokens_x(ctx, entries[0].get());
|
||||
const size_t row_sz = static_cast<size_t>(tile_h) * n_embd;
|
||||
const size_t tile_sz = static_cast<size_t>(tile_h) * row_sz;
|
||||
const std::vector<float> newline = clip_get_newline_embd(ctx);
|
||||
GGML_ASSERT(!newline.empty());
|
||||
|
||||
for (int r = 0; r < batch.grid_y; r++) {
|
||||
for (int pr = 0; pr < tile_h; pr++) {
|
||||
for (int c = 0; c < batch.grid_x; c++) {
|
||||
const float * tile = raw.data() + static_cast<size_t>(r * batch.grid_x + c) * tile_sz;
|
||||
memcpy(out, tile + static_cast<size_t>(pr) * row_sz, row_sz * sizeof(float));
|
||||
out += row_sz;
|
||||
}
|
||||
memcpy(out, newline.data(), static_cast<size_t>(n_embd) * sizeof(float));
|
||||
out += n_embd;
|
||||
}
|
||||
}
|
||||
// overview = raw's last encoded chunk; size it from the entry, not raw.size() (raw is over-allocated)
|
||||
const size_t global_off = static_cast<size_t>(n_tiles) * tile_sz;
|
||||
const size_t global_sz = static_cast<size_t>(clip_n_output_tokens(ctx, entries.back().get())) * n_embd;
|
||||
memcpy(out, raw.data() + global_off, global_sz * sizeof(float));
|
||||
}
|
||||
|
||||
static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
|
||||
clip_ctx * ctx_clip = ctx->ctx_v;
|
||||
if (!ctx_clip) {
|
||||
@@ -1417,12 +1452,17 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
|
||||
|| proj_type == PROJECTOR_TYPE_MINICPMV
|
||||
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| proj_type == PROJECTOR_TYPE_INTERNVL
|
||||
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR
|
||||
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
|
||||
|| proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
// entries may have different token counts
|
||||
// e.g., DeepSeek-OCR-2: 144 tokens per tile view, 257 for the global view
|
||||
// DeepSeek-OCR v1, when multi-view, weaves its tiles into a grid (see stitch_tile_grid)
|
||||
const bool is_dsocr_mlt = proj_type == PROJECTOR_TYPE_DEEPSEEKOCR && entries.size() > 1;
|
||||
std::vector<float> raw(is_dsocr_mlt ? static_cast<size_t>(n_embd_out) * n_tokens_out : 0);
|
||||
float * dst = is_dsocr_mlt ? raw.data() : out_embd.data();
|
||||
size_t offset = 0;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
if (entries[i]->is_placeholder()) {
|
||||
@@ -1441,9 +1481,12 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
|
||||
return 1;
|
||||
}
|
||||
ok = true;
|
||||
std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
|
||||
std::copy(tmp_embd.begin(), tmp_embd.end(), dst + offset);
|
||||
offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
|
||||
}
|
||||
if (is_dsocr_mlt) {
|
||||
stitch_tile_grid(ctx_clip, image_tokens->batch_f32, raw, n_embd_out, out_embd.data());
|
||||
}
|
||||
} else {
|
||||
if (image_tokens->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 225 KiB |
@@ -82,6 +82,24 @@ CASES = [
|
||||
# is one pixel off and lands at ~0.69 instead.
|
||||
hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
|
||||
),
|
||||
TestCase(
|
||||
model_key="v1", label="multi-tile (dynamic resolution)",
|
||||
image="tools/mtmd/tests/test-1-positive.png",
|
||||
ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
|
||||
# 429x806 -- 806 > 640 triggers the v1 "Gundam" path: (1,2) grid ->
|
||||
# 2 local 640 tiles + 1 global 1024 view. Regression guard for the
|
||||
# tiling preprocessor -- a broken tile path craters the score.
|
||||
# hf_cer/hf_chrf are HF v1's measured scores -- it reads this clean crop exactly.
|
||||
hf_cer=0.0000, hf_chrf=100.00, cer_tol=0.03, chrf_tol=3.0,
|
||||
),
|
||||
TestCase(
|
||||
model_key="v2", label="multi-tile (dynamic resolution)",
|
||||
image="tools/mtmd/tests/test-1-positive.png",
|
||||
ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
|
||||
# 429x806 -- 806 > 768 triggers the v2 path: (1,2) grid ->
|
||||
# 2 local 768 tiles + 1 global 1024 view = 545 image tokens.
|
||||
hf_cer=0.0236, hf_chrf=97.05, cer_tol=0.03, chrf_tol=3.0,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user