mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 06:10:19 +00:00
mtmd: refactor llava-uhd overview image handling (always use ov_img_first) (#24769)
* add dedicated "overview" for mtmd_image_preproc_out * corrections * correct (again) * nits * nits (2)
This commit is contained in:
+34
-36
@@ -26,6 +26,13 @@ void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32
|
||||
entries.push_back(std::move(img));
|
||||
}
|
||||
|
||||
void mtmd_image_preproc_out::append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) {
|
||||
overview.from_u8(img);
|
||||
if (normalized) {
|
||||
overview.normalize(hparams.image_mean, hparams.image_std);
|
||||
}
|
||||
}
|
||||
|
||||
// set of tools to manipulate images
|
||||
// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
|
||||
struct img_tool {
|
||||
@@ -607,10 +614,11 @@ private:
|
||||
mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) {
|
||||
const clip_image_size original_size = img.get_size();
|
||||
auto const inst = get_slice_instructions(original_size);
|
||||
std::vector<clip_image_u8> imgs = slice_image(img, inst);
|
||||
auto sliced = slice_image(img, inst);
|
||||
|
||||
mtmd_image_preproc_out output;
|
||||
output.append(hparams, imgs, true);
|
||||
output.append_overview(hparams, sliced.overview, true);
|
||||
output.append(hparams, sliced.slices, true);
|
||||
output.grid_x = inst.grid_size.width;
|
||||
output.grid_y = inst.grid_size.height;
|
||||
|
||||
@@ -722,22 +730,15 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
|
||||
std::vector<clip_image_u8> output;
|
||||
mtmd_image_preprocessor_llava_uhd::slice_output mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst) {
|
||||
slice_output output;
|
||||
|
||||
// resize to overview size
|
||||
clip_image_u8 resized_img;
|
||||
img_tool::resize(img, resized_img, inst.overview_size, hparams.image_resize_algo_ov,
|
||||
img_tool::resize(img, output.overview, inst.overview_size, hparams.image_resize_algo_ov,
|
||||
hparams.image_pad_ov, hparams.image_pad_color_ov);
|
||||
if (overview_first) {
|
||||
output.push_back(resized_img);
|
||||
}
|
||||
|
||||
if (inst.slices.empty()) {
|
||||
// no slices, just return the resized image
|
||||
if (!overview_first) {
|
||||
output.push_back(resized_img);
|
||||
}
|
||||
// no slices, just return the overview image
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -755,11 +756,7 @@ std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const
|
||||
|
||||
clip_image_u8 img_slice;
|
||||
img_tool::crop(refined_img, img_slice, x, y, w, h);
|
||||
output.push_back(std::move(img_slice));
|
||||
}
|
||||
|
||||
if (!overview_first) {
|
||||
output.push_back(resized_img);
|
||||
output.slices.push_back(std::move(img_slice));
|
||||
}
|
||||
|
||||
return output;
|
||||
@@ -1077,10 +1074,11 @@ mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_i
|
||||
});
|
||||
}
|
||||
}
|
||||
auto imgs = slice_image(img, instructions);
|
||||
auto sliced = slice_image(img, instructions);
|
||||
|
||||
mtmd_image_preproc_out output;
|
||||
output.append(hparams, imgs, true);
|
||||
output.append_overview(hparams, sliced.overview, true);
|
||||
output.append(hparams, sliced.slices, true);
|
||||
output.grid_x = instructions.grid_size.width;
|
||||
output.grid_y = instructions.grid_size.height;
|
||||
return output;
|
||||
@@ -1094,10 +1092,12 @@ mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_i
|
||||
GGML_ASSERT(!hparams.image_res_candidates.empty());
|
||||
const clip_image_size original_size = img.get_size();
|
||||
auto const inst = get_slice_instructions(original_size);
|
||||
std::vector<clip_image_u8> imgs = slice_image(img, inst, false);
|
||||
auto sliced = slice_image(img, inst);
|
||||
|
||||
mtmd_image_preproc_out output;
|
||||
output.append(hparams, imgs, true);
|
||||
// InternVL: slices first, then overview
|
||||
output.append(hparams, sliced.slices, true);
|
||||
output.append_overview(hparams, sliced.overview, true);
|
||||
output.grid_x = inst.grid_size.width;
|
||||
output.grid_y = inst.grid_size.height;
|
||||
return output;
|
||||
@@ -1131,9 +1131,10 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const cli
|
||||
img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
PAD_NEAREST, hparams.image_pad_color);
|
||||
mtmd_image_preproc_out output;
|
||||
output.append(hparams, padded, true);
|
||||
output.grid_x = 1;
|
||||
output.grid_y = 1;
|
||||
output.append_overview(hparams, padded, true);
|
||||
output.grid_x = 0;
|
||||
output.grid_y = 0;
|
||||
// TODO @ngxson : support slicing for DeepSeek-OCR, to do in another PR
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -1226,10 +1227,8 @@ mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const cl
|
||||
clip_image_u8 padded;
|
||||
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
PAD_NEAREST, hparams.image_pad_color);
|
||||
output.append(hparams, padded, true);
|
||||
output.entries.back().add_viewsep = true;
|
||||
output.grid_x = 1;
|
||||
output.grid_y = 1;
|
||||
output.append_overview(hparams, padded, true);
|
||||
output.overview.add_viewsep = true;
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -1447,15 +1446,14 @@ mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_im
|
||||
const auto instructions = build_slice_instructions(hparams, prepared.get_size());
|
||||
|
||||
mtmd_image_preproc_out output;
|
||||
clip_image_f32 overview_f32;
|
||||
// overview (normalized f32, already includes mean/std)
|
||||
img_u8_resize_bilinear_to_f32(
|
||||
prepared,
|
||||
overview_f32,
|
||||
output.overview,
|
||||
hparams.image_size,
|
||||
hparams.image_size,
|
||||
hparams.image_mean,
|
||||
hparams.image_std);
|
||||
output.append(hparams, overview_f32, false);
|
||||
|
||||
if (instructions.slices.empty()) {
|
||||
output.grid_x = 0;
|
||||
@@ -1548,13 +1546,13 @@ mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_im
|
||||
|
||||
mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) {
|
||||
auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img);
|
||||
if (output.entries.size() == 1) {
|
||||
if (output.entries.size() == 0) {
|
||||
// Single-tile (overview only): append one newline row.
|
||||
output.entries[0].add_newline = true;
|
||||
output.overview.add_newline = true;
|
||||
} else {
|
||||
// Multi-tile: overview gets no newline, grid tiles get one.
|
||||
output.entries[0].add_newline = false;
|
||||
for (size_t i = 1; i < output.entries.size(); ++i) {
|
||||
output.overview.add_newline = false;
|
||||
for (size_t i = 0; i < output.entries.size(); ++i) {
|
||||
output.entries[i].add_newline = true;
|
||||
}
|
||||
}
|
||||
|
||||
+15
-1
@@ -11,11 +11,19 @@
|
||||
struct mtmd_image_preproc_out {
|
||||
std::vector<clip_image_f32> entries;
|
||||
// grid size is required for llava-uhd style models
|
||||
|
||||
clip_image_f32 overview; // overview image (downscaled image)
|
||||
int grid_x = 0;
|
||||
int grid_y = 0;
|
||||
|
||||
void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
|
||||
void append(const clip_hparams & hparams, const std::vector<clip_image_u8> & imgs, bool normalized = true);
|
||||
void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true);
|
||||
|
||||
void append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
|
||||
bool has_overview() const {
|
||||
return overview.nx() > 0 || overview.ny() > 0;
|
||||
}
|
||||
};
|
||||
|
||||
// base class, models must inherit from this class
|
||||
@@ -46,6 +54,8 @@ struct mtmd_image_preprocessor {
|
||||
* [overview] --> [slice 1] --> [slice 2]
|
||||
* | |
|
||||
* +--> [slice 3] --> [slice 4]
|
||||
*
|
||||
* NOTE: for the ordering of overview, set "ov_img_first" on the mtmd_context
|
||||
*/
|
||||
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
@@ -67,7 +77,11 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
||||
// LFM2 overrides this function to implement its custom slicing logic
|
||||
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
|
||||
|
||||
std::vector<clip_image_u8> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
|
||||
// Result of slice_image(): the overview image plus the cropped slices.
struct slice_output {
|
||||
// input image resized to the overview size
clip_image_u8 overview;
|
||||
// cropped slices; left empty when the slice instructions contain no slices
std::vector<clip_image_u8> slices;
|
||||
};
|
||||
slice_output slice_image(const clip_image_u8 & img, const slice_instructions & inst);
|
||||
|
||||
private:
|
||||
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
|
||||
+64
-51
@@ -516,6 +516,7 @@ struct mtmd_context {
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
ov_img_first = false;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
@@ -539,6 +540,7 @@ struct mtmd_context {
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
|
||||
ov_img_first = false;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
@@ -615,11 +617,13 @@ struct mtmd_context {
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
ov_img_first = false;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
|
||||
ov_img_first = false;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
@@ -640,6 +644,7 @@ struct mtmd_context {
|
||||
img_beg = "<image>";
|
||||
img_end = "";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
|
||||
ov_img_first = true;
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
@@ -1079,26 +1084,38 @@ struct mtmd_tokenizer {
|
||||
|
||||
// for llava-uhd style, we need to handle grid too
|
||||
// we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd
|
||||
if (tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) {
|
||||
if ((tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0)
|
||||
|| tmp_preproc_out.has_overview()) {
|
||||
GGML_ASSERT(bitmaps.size() == 1);
|
||||
preproc_out.grid_x = tmp_preproc_out.grid_x;
|
||||
preproc_out.grid_y = tmp_preproc_out.grid_y;
|
||||
preproc_out.overview = std::move(tmp_preproc_out.overview);
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: preproc_out has %zu entries, grid_x = %d, grid_y = %d, has_overview = %d\n",
|
||||
__func__, preproc_out.entries.size(), preproc_out.grid_x, preproc_out.grid_y,
|
||||
preproc_out.has_overview() ? 1 : 0);
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
const bool has_tiling_grid = preproc_out.grid_x > 0 && preproc_out.grid_y > 0;
|
||||
// (output either a grid, or overview-only)
|
||||
const bool has_tiling_grid = (preproc_out.grid_x > 0 && preproc_out.grid_y > 0)
|
||||
|| preproc_out.has_overview();
|
||||
|
||||
if (has_tiling_grid) {
|
||||
// [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
|
||||
GGML_ASSERT(bitmaps.size() == 1);
|
||||
|
||||
const int n_col = preproc_out.grid_x;
|
||||
const int n_row = preproc_out.grid_y;
|
||||
|
||||
// split batch into chunks of single images
|
||||
// NOTE: preproc_out will be invalidated after this call
|
||||
auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
// NOTE: preproc_out is invalidated after this point, do not use it anymore
|
||||
|
||||
// split_batch_to_chunk must always put the overview image first
|
||||
auto ov_chunk = std::move(chunks.front());
|
||||
chunks.erase(chunks.begin());
|
||||
|
||||
@@ -1125,7 +1142,16 @@ struct mtmd_tokenizer {
|
||||
std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
|
||||
add_text(std::string(buf.get(), buf.get() + sz - 1), true);
|
||||
}
|
||||
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
|
||||
auto & curr_chunk = chunks[y * n_col + x];
|
||||
auto & curr_batch = curr_chunk.tokens_image->batch_f32;
|
||||
if (curr_batch.entries.size() != 1) {
|
||||
throw std::runtime_error(string_format("%s: expect 1 image in batch_f32", __func__));
|
||||
}
|
||||
|
||||
LOG_DBG("%s: adding slice image at row %d col %d\n", __func__, y, x);
|
||||
cur.entries.emplace_back(std::move(curr_chunk));
|
||||
|
||||
add_text(ctx->tok_sli_img_end);
|
||||
if (!is_last_in_row) {
|
||||
add_text(ctx->tok_sli_img_mid);
|
||||
@@ -1147,6 +1173,11 @@ struct mtmd_tokenizer {
|
||||
|
||||
} else {
|
||||
|
||||
if (preproc_out.entries.size() == 0) {
|
||||
LOG_ERR("%s: no image tokens produced by preprocessor (ref: https://github.com/ggml-org/llama.cpp/pull/24769)\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
size_t n_tokens = 0;
|
||||
for (auto & e : preproc_out.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, &e);
|
||||
@@ -1303,13 +1334,15 @@ struct mtmd_tokenizer {
|
||||
std::vector<mtmd_input_chunk> split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) {
|
||||
std::vector<mtmd_input_chunk> chunks;
|
||||
|
||||
for (auto & entry : preproc_out.entries) {
|
||||
auto process_chunk = [&](clip_image_f32 && img) {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &entry);
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &img);
|
||||
image_tokens->ny = 1;
|
||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
||||
image_tokens->batch_f32.entries.push_back(std::move(img));
|
||||
image_tokens->id = id;
|
||||
|
||||
GGML_ASSERT(image_tokens->nx > 0);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{}, // text tokens
|
||||
@@ -1317,6 +1350,21 @@ struct mtmd_tokenizer {
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
chunks.emplace_back(std::move(chunk));
|
||||
};
|
||||
|
||||
// overview image first
|
||||
auto & overview = preproc_out.overview;
|
||||
if (overview.nx() == 0 || overview.ny() == 0) {
|
||||
throw std::runtime_error(string_format("%s: invalid overview image for llava-uhd style preprocessing\n", __func__));
|
||||
}
|
||||
process_chunk(std::move(preproc_out.overview));
|
||||
|
||||
// then, process slices
|
||||
for (auto & entry : preproc_out.entries) {
|
||||
if (entry.nx() == 0 || entry.ny() == 0) {
|
||||
throw std::runtime_error(string_format("%s: invalid image slice for llava-uhd style preprocessing\n", __func__));
|
||||
}
|
||||
process_chunk(std::move(entry));
|
||||
}
|
||||
|
||||
return chunks;
|
||||
@@ -1390,57 +1438,22 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
|
||||
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
auto proj_type = clip_get_projector_type(ctx_clip);
|
||||
|
||||
int n_embd_out = ctx->n_embd_out();
|
||||
auto n_tokens_out = image_tokens->n_tokens();
|
||||
out_embd.resize((size_t)n_embd_out * n_tokens_out);
|
||||
|
||||
bool ok = false;
|
||||
|
||||
if (clip_is_llava(ctx_clip)
|
||||
|| proj_type == PROJECTOR_TYPE_MINICPMV
|
||||
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| proj_type == PROJECTOR_TYPE_INTERNVL
|
||||
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
|
||||
|| proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
// entries may have different token counts
|
||||
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
|
||||
size_t offset = 0;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
if (entries[i].is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
|
||||
return 1;
|
||||
}
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, &entries[i]);
|
||||
std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
|
||||
bool ok_i = clip_image_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&entries[i],
|
||||
tmp_embd);
|
||||
if (!ok_i) {
|
||||
LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
|
||||
return 1;
|
||||
}
|
||||
ok = true;
|
||||
std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
|
||||
offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
|
||||
}
|
||||
} else {
|
||||
if (image_tokens->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
ok = clip_image_batch_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
out_embd);
|
||||
if (image_tokens->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
out_embd);
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user