Files
llama.cpp/tools/mtmd/models/granite4-vision.cpp
Gabe Goodhart a3900a6694 model: Granite Speech Plus (#24818)
* feat: Add conversion support for Granite Speech Plus

Branch: GraniteSpeechPlus
AI-usage: full (Bob, OpenCode + Qwen3.6-35b)
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Extend granite_speech to support plus multi-layer concatenation

Branch: GraniteSpeechPlus
AI-usage: draft (Bob, OpenCode + Qwen3.6-35b)
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(conversion): Fix plural naming for feature_layers for audio

Branch: GraniteSpeechPlus
AI-usage: none
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Align feature_layer usage and naming everywhere

Branch: GraniteSpeechPlus
AI-usage: none
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Use fstring for log

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2026-06-23 12:03:31 +02:00

340 lines
12 KiB
C++

#include "models.h"
#include "../clip-impl.h"
#include "../clip-model.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
/*
* Granite Vision 4.1 clip graph
*
* Stage 1a: SigLIP vision tower (N layers, post-norm)
* Stage 1b: WindowQFormer blocks (deepstack + spatial)
* Stage 1c: Concatenate and pack outputs
* Stage 1d: Append newline tokens if add_newline is set
*/
// ---------------------------------------------------------------------------
// Member method implementations
// ---------------------------------------------------------------------------
// Select rows of `src` through a freshly created index tensor.
//
// A 1-D I32 tensor of length `idx_len` is created in ctx0, given the name
// `name` and flagged as a graph input; the result is ggml_get_rows(src, idx).
// The index values themselves are not written here — presumably they are
// filled in by name before the graph is computed (confirm against the caller).
ggml_tensor * clip_graph_granite4_vision::gather(
        ggml_tensor * src,
        const std::string & name,
        int idx_len) {
    ggml_tensor * row_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len);
    ggml_set_input(row_ids);
    ggml_set_name(row_ids, name.c_str());

    ggml_tensor * picked = ggml_get_rows(ctx0, src, row_ids);
    return picked;
}
// Downsample a raster of side*side tokens to new_side*new_side tokens by
// average pooling with a square kernel of size side / new_side (stride equal
// to the kernel, no padding).
//
//   src      : (n_embd, side*side) token matrix
//   side     : side length of the input raster
//   new_side : side length of the output raster; must be positive and divide
//              `side` exactly
//
// Returns a (n_embd, new_side*new_side) tensor.
ggml_tensor * clip_graph_granite4_vision::interp_down(
        ggml_tensor * src,
        int side,
        int new_side) {
    // Fail early with a readable message instead of dividing by zero or
    // tripping an opaque reshape assert further down.
    GGML_ASSERT(new_side > 0 && side >= new_side);
    GGML_ASSERT(side % new_side == 0);
    GGML_ASSERT(src->ne[1] == (int64_t) side * side);

    const int n_embd = (int) src->ne[0];
    const int kernel = side / new_side;

    // (n_embd, side, side, 1) -> (side, side, n_embd, 1): the two raster axes
    // become the leading dims that ggml_pool_2d pools over.
    ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1);
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3));

    t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0.0f, 0.0f);

    // (new_side, new_side, n_embd, 1) -> (n_embd, new_side, new_side, 1),
    // then flatten the raster back into a token axis.
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3));
    return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side);
}
// ---------------------------------------------------------------------------
// build_block - WindowQFormer block implementation
// ---------------------------------------------------------------------------
// Build one WindowQFormer projector block.
//
//   blk            : weights of this projector block (exactly one QFormer layer)
//   h              : (n_embd, image_side*image_side) hidden state from the vision tower
//   bid            : block index; used in tensor names and passed to build_norm/build_attn
//   spatial_offset : only its sign is used here: >= 0 selects the index-gather
//                    downsampler (indices come from the "_spatial_idx" input),
//                    negative selects average-pool downsampling
//   image_side     : side length of the patch raster
//   window_side    : side length of one encoder window (must divide image_side)
//   query_side     : side length of the query grid per window
//   qformer_eps    : LayerNorm epsilon used inside the QFormer
//
// Returns the output of the block's final linear projection applied to the
// (n_embd, new_side*new_side) un-windowed QFormer output, where
// new_side = (image_side / window_side) * query_side.
//
// All window / un-window reorderings are done with ggml_get_rows over named
// I32 input tensors ("g4v_blk<bid>_*_idx"); those names must stay stable.
ggml_tensor * clip_graph_granite4_vision::build_block(
        const qf_block & blk,
        ggml_tensor * h,
        int bid,
        int spatial_offset,
        int image_side,
        int window_side,
        int query_side,
        float qformer_eps) {
    const int n_embd = (int) h->ne[0];
    GGML_ASSERT(h->ne[1] == image_side * image_side);
    // Explicit geometry checks: without them a bad configuration divides by
    // zero or only fails later inside an unrelated reshape.
    GGML_ASSERT(window_side > 0 && query_side > 0);
    GGML_ASSERT(image_side % window_side == 0);

    const int n         = image_side / window_side;   // windows per raster side
    const int new_side  = n * query_side;
    const int n_windows = n * n;
    const int enc_len   = window_side * window_side;  // tokens per encoder window
    const int query_len = query_side * query_side;    // query tokens per window

    // QFormer attention uses a fixed head size of 64.
    const int   d_h    = 64;
    GGML_ASSERT(n_embd % d_h == 0);
    const int   n_head = n_embd / d_h;
    const float scale  = 1.0f / std::sqrt((float) d_h);

    const std::string blk_prefix = "g4v_blk" + std::to_string(bid);

    // Name an intermediate tensor "g4v_blk<bid>_<step>".
    auto cbx = [&](ggml_tensor * t, const char * step) {
        const std::string name = blk_prefix + "_" + step;
        ggml_set_name(t, name.c_str());
    };

    // Linear projection over a (C, len, n_windows) tensor: windows are folded
    // into the token axis so a single matmul covers every window.
    auto linear = [&](ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * {
        ggml_tensor * t = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2]);
        t = build_mm(w, t);
        if (b) {
            t = ggml_add(ctx0, t, b);
        }
        return t;
    };

    // 1. Top-level LN
    // NOTE(review): this norm uses the graph-level `eps`, not `qformer_eps` —
    // confirm that is intended.
    cbx(h, "inp");
    ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid);
    cbx(x, "norm");

    // 2. enc = _win(x, image_side, window_side) -> (C, enc_len, n_windows)
    ggml_tensor * enc;
    {
        ggml_tensor * enc_flat = gather(x, blk_prefix + "_win_idx", image_side * image_side);
        enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows);
    }
    cbx(enc, "enc");

    // 3. downsampled = downsampler(x) -> (C, new_side*new_side)
    ggml_tensor * d;
    if (spatial_offset >= 0) {
        d = gather(x, blk_prefix + "_spatial_idx", new_side * new_side);
    } else {
        d = interp_down(x, image_side, new_side);
    }
    cbx(d, "downsampled");

    // 4. query_embeds = query + _win(d, new_side, query_side) -> (C, query_len, n_windows)
    ggml_tensor * q_in;
    {
        ggml_tensor * dw_flat = gather(d, blk_prefix + "_qwin_idx", new_side * new_side);
        ggml_tensor * dw = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows);
        q_in = ggml_add(ctx0, dw, blk.qf_proj_query);
    }
    cbx(q_in, "query_embeds");

    // 5. encoder_embeds = enc + image_positions -> (C, enc_len, n_windows)
    ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos);
    cbx(e_in, "encoder_embeds");

    // 6. QFormer forward: LN on the queries, then a single layer of
    //    self-attention, cross-attention and FFN, each followed by a residual
    //    add and a LayerNorm.
    ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid);

    GGML_ASSERT(blk.qf_proj_layers.size() == 1);
    const auto & pl = blk.qf_proj_layers[0];

    // 6a. Self-attention over the queries of each window
    ggml_tensor * sa_out;
    {
        const int nq = (int) q->ne[1];
        ggml_tensor * Q = linear(q, pl.q_w, pl.q_b);
        ggml_tensor * K = linear(q, pl.k_w, pl.k_b);
        ggml_tensor * V = linear(q, pl.v_w, pl.v_b);
        // Windows ride along in dim 3 so attention never mixes windows.
        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows);
        sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid);
        sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows);
        sa_out = ggml_add(ctx0, sa_out, q);
        sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b,
                            NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(sa_out, "sa_out");

    // 6b. Cross-attention: queries attend to the encoder tokens of their window
    ggml_tensor * ca_out;
    {
        const int nq  = (int) sa_out->ne[1];
        const int nkv = (int) e_in->ne[1];
        ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b);
        ggml_tensor * K = linear(e_in, pl.cross_attn_k_w, pl.cross_attn_k_b);
        ggml_tensor * V = linear(e_in, pl.cross_attn_v_w, pl.cross_attn_v_b);
        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows);
        ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
                            Q, K, V, nullptr, scale, bid);
        ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows);
        ca_out = ggml_add(ctx0, ca_out, sa_out);
        ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b,
                            NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ca_out, "ca_out");

    // 6c. FFN (up -> GELU(erf) -> down) with residual and LN
    ggml_tensor * ffn;
    {
        ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows);
        t = build_mm(pl.ff_up_w, t);
        if (pl.ff_up_b) {
            t = ggml_add(ctx0, t, pl.ff_up_b);
        }
        t = ggml_gelu_erf(ctx0, t);
        t = build_mm(pl.ff_down_w, t);
        if (pl.ff_down_b) {
            t = ggml_add(ctx0, t, pl.ff_down_b);
        }
        t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows);
        ffn = ggml_add(ctx0, t, ca_out);
        ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ffn, "qformer_out");

    // 7. _unwin back to raster order -> (C, new_side*new_side)
    ggml_tensor * unwinned;
    {
        ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows);
        unwinned = gather(flat, blk_prefix + "_unwin_idx", new_side * new_side);
    }
    cbx(unwinned, "unwin");

    // 8. out_linear
    ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned);
    if (blk.qf_proj_linear_b) {
        out = ggml_add(ctx0, out, blk.qf_proj_linear_b);
    }
    cbx(out, "out");
    return out;
}
// ---------------------------------------------------------------------------
// Newline-row helpers and build() - top-level graph
// ---------------------------------------------------------------------------
// Build the newline row tensor: model.image_newline repeated K times
// back-to-back, where K is the number of QFormer projector blocks.
// Every copy is identical — no per-block scaling is applied.
//
// Requires n_mmproj_embd to be a multiple of K and image_newline to hold
// exactly n_mmproj_embd / K elements.
//
// Shape: (n_mmproj_embd, 1)
ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) {
    const int K = (int) model.qf_proj_blocks.size();
    GGML_ASSERT(K > 0);
    GGML_ASSERT(n_mmproj_embd % K == 0);
    const int projection_dim = n_mmproj_embd / K;
    GGML_ASSERT(model.image_newline != nullptr);
    GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim);

    // newline_row[k*projection_dim + d] = nl[d] for every k in [0, K)
    ggml_tensor * nl_col = ggml_reshape_2d(ctx0, model.image_newline, projection_dim, 1);
    ggml_tensor * tiled  = nl_col;
    if (K > 1) {
        // The template tensor only supplies the target shape (projection_dim, K)
        // for ggml_repeat.
        ggml_tensor * shape = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, K);
        tiled = ggml_repeat(ctx0, nl_col, shape);
    }
    tiled = ggml_cont(ctx0, tiled);

    // Column-major (projection_dim, K) flattens to K consecutive copies of nl.
    return ggml_reshape_2d(ctx0, tiled, n_mmproj_embd, 1);
}
// Return `tile_output` with one newline row (from build_newline_row)
// concatenated after it along dim 1, i.e. as one extra trailing token.
// Within this file the only caller is build(), which passes the full
// projector output.
ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) {
    ggml_tensor * newline_token = build_newline_row(ctx0);
    ggml_tensor * with_newline  = ggml_concat(ctx0, tile_output, newline_token, 1);
    return with_newline;
}
// Build the full Granite Vision 4 clip graph.
//
//   1a. Run the vision encoder layers, keeping every layer's output.
//   1b. For each entry of hparams.feature_layers, feed the selected layer
//       output through the matching WindowQFormer projector block.
//   1c. Concatenate the block outputs along dim 0 (the embedding axis).
//   1d. If add_newline is set, append one newline row along dim 1.
//
// The final tensor is named "g4v_mmproj_out_nl" or "g4v_mmproj_out" and is
// expanded into gf, which is returned.
ggml_cgraph * clip_graph_granite4_vision::build() {
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(!model.qf_proj_blocks.empty());

    // Each feature layer is paired by index with a projector block, so there
    // must be at least one and no more than there are blocks; otherwise the
    // loop below would index past qf_proj_blocks or leave the output null.
    // NOTE(review): hparams.proj_spatial_offsets is indexed the same way and is
    // assumed to have at least projector_count entries — confirm at load time.
    const int projector_count = (int) hparams.feature_layers.size();
    GGML_ASSERT(projector_count > 0);
    GGML_ASSERT(projector_count <= (int) model.qf_proj_blocks.size());

    // --- Stage 1a: vision encoder producing intermediate hidden states ---
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, model.position_embeddings);
    cb(inp, "pos_embed", -1);

    ggml_tensor * inpL = inp;
    std::vector<ggml_tensor *> layer_outs(n_layer, nullptr);
    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];
        ggml_tensor * cur = inpL;

        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);

        // Self-attention
        ggml_tensor * Qcur = build_mm(layer.q_w, cur);
        if (layer.q_b) {
            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
        }
        ggml_tensor * Kcur = build_mm(layer.k_w, cur);
        if (layer.k_b) {
            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
        }
        ggml_tensor * Vcur = build_mm(layer.v_w, cur);
        if (layer.v_b) {
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
        }
        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
        cur = build_attn(layer.o_w, layer.o_b,
                         Qcur, Kcur, Vcur, nullptr, kq_scale, il);

        // Residual around attention
        cur = ggml_add(ctx0, cur, inpL);
        inpL = cur;

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cur = build_ffn(cur,
                        layer.ff_up_w, layer.ff_up_b,
                        layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);

        // Residual around FFN
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        layer_outs[il] = cur;
        inpL = cur;
    }

    // --- Stage 1b/1c: WindowQFormer blocks ---
    const float qformer_eps = 1e-12f;
    ggml_tensor * mmproj = nullptr;
    for (int bid = 0; bid < projector_count; ++bid) {
        const auto & blk = model.qf_proj_blocks[bid];
        const int vlayer = hparams.feature_layers[bid];
        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);

        ggml_tensor * h = layer_outs[vlayer];
        ggml_tensor * stream = build_block(
            blk, h, bid,
            hparams.proj_spatial_offsets[bid],
            n_patches_x,
            hparams.downsample_window_side,
            hparams.downsample_query_side,
            qformer_eps);

        const std::string out_name = "proj_" + std::to_string(bid) + "_v_out";
        cb(stream, out_name.c_str(), vlayer);

        // Stack the per-block outputs along the embedding axis.
        mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream;
    }

    // --- Stage 1d: Append newline tokens if add_newline is set ---
    if (add_newline) {
        mmproj = append_rowwise_newlines(ctx0, mmproj);
        ggml_set_name(mmproj, "g4v_mmproj_out_nl");
    } else {
        ggml_set_name(mmproj, "g4v_mmproj_out");
    }

    ggml_build_forward_expand(gf, mmproj);
    return gf;
}