mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 06:10:19 +00:00
a3900a6694
* feat: Add conversion support for Granite Speech Plus Branch: GraniteSpeechPlus AI-usage: full (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Extend granite_speech to support plus multi-layer concatenation Branch: GraniteSpeechPlus AI-usage: draft (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(conversion): Fix plural naming for feature_layers for audio Branch: GraniteSpeechPlus AI-usage: none Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(mtmd): Align feature_layer usage and naming everywhere Branch: GraniteSpeechPlus AI-usage: none Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * style: Use fstring for log Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
340 lines
12 KiB
C++
340 lines
12 KiB
C++
#include "models.h"
|
|
#include "../clip-impl.h"
|
|
#include "../clip-model.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
/*
|
|
* Granite Vision 4.1 clip graph
|
|
*
|
|
* Stage 1a: SigLIP vision tower (N layers, post-norm)
|
|
* Stage 1b: WindowQFormer blocks (deepstack + spatial)
|
|
* Stage 1c: Concatenate and pack outputs
|
|
* Stage 1d: Append newline tokens if add_newline is set
|
|
*/
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Member method implementations
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Gather rows of `src` through a named graph-input index tensor.
//
//   src     - tensor whose rows are selected
//   name    - name given to the index tensor
//   idx_len - number of I32 indices (i.e. number of rows gathered)
//
// The index tensor is only created, named and flagged as an input here; its
// values are not written in this function. Presumably they are filled in by
// name before the graph is computed - TODO confirm against the input-setting code.
ggml_tensor * clip_graph_granite4_vision::gather(
        ggml_tensor *       src,
        const std::string & name,
        int                 idx_len) {
    ggml_tensor * row_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len);
    ggml_set_input(row_ids);
    ggml_set_name(row_ids, name.c_str());

    ggml_tensor * picked = ggml_get_rows(ctx0, src, row_ids);
    return picked;
}
|
|
|
|
// Downsample a square token grid with non-overlapping average pooling.
//
//   src      - tensor viewed as (n_embd, side * side); n_embd is taken from src->ne[0]
//   side     - side length of the input grid
//   new_side - requested side length of the output grid
//
// Returns a tensor reshaped to (n_embd, new_side * new_side).
//
// The pooling kernel and stride are both side / new_side (integer division),
// with no padding.
// NOTE(review): when side is not a multiple of new_side the pooled grid size is
// decided by ggml_pool_2d, and the final reshape presumably rejects a mismatch -
// confirm callers always pass compatible sides.
ggml_tensor * clip_graph_granite4_vision::interp_down(
        ggml_tensor * src,
        int           side,
        int           new_side) {
    // new_side == 0 would divide by zero below; side < new_side would yield a
    // zero-sized kernel/stride for the pooling op.
    GGML_ASSERT(new_side > 0 && side >= new_side);

    const int n_embd = src->ne[0];

    // (n_embd, side, side, 1) -> (side, side, n_embd, 1): move the two grid axes
    // to the front so the 2D pooling runs over the grid, once per channel.
    ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1);
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3));

    const int kernel = side / new_side;
    t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0);

    // Move the channel axis back to dim 0 and flatten the grid again.
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3));
    return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// build_block - WindowQFormer block implementation
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Build the graph for one WindowQFormer projector block.
//
//   blk            - weights of this block (input norm, learned query, image
//                    positions, query norm, exactly one QFormer layer, output linear)
//   h              - input hidden state; h->ne[1] must equal image_side * image_side
//   bid            - block index; used in tensor names ("g4v_blk<bid>_...") and
//                    forwarded as the layer id to build_norm / build_attn
//   spatial_offset - only its sign is used here: >= 0 selects the gather-based
//                    downsampler (input "g4v_blk<bid>_spatial_idx"), a negative
//                    value selects average pooling via interp_down().
//                    NOTE(review): the numeric offset presumably determines the
//                    contents of that index input elsewhere - confirm.
//   image_side     - side length of the input token grid
//   window_side    - side length of one encoder window
//   query_side     - side length of the per-window query grid
//   qformer_eps    - epsilon for the norms inside the QFormer layer
//
// Returns the output of the block's final linear projection, applied to the
// new_side * new_side rows gathered back through "g4v_blk<bid>_unwin_idx".
ggml_tensor * clip_graph_granite4_vision::build_block(
        const qf_block & blk,
        ggml_tensor *    h,
        int              bid,
        int              spatial_offset,
        int              image_side,
        int              window_side,
        int              query_side,
        float            qformer_eps) {

    const int n_embd = h->ne[0];
    GGML_ASSERT(h->ne[1] == image_side * image_side);

    // Guard the integer divisions below and the window tiling of the grid.
    GGML_ASSERT(window_side > 0 && query_side > 0);
    GGML_ASSERT(image_side % window_side == 0);

    const int n         = image_side / window_side;   // windows per side
    const int new_side  = n * query_side;              // side of the downsampled grid
    const int n_windows = n * n;
    const int enc_len   = window_side * window_side;   // encoder tokens per window
    const int query_len = query_side * query_side;     // query tokens per window

    // Attention head layout shared by self- and cross-attention.
    // The head size is fixed at 64, so n_embd must split evenly into heads.
    const int   d_h    = 64;
    GGML_ASSERT(n_embd % d_h == 0);
    const int   n_head = n_embd / d_h;
    const float scale  = 1.0f / std::sqrt((float) d_h);

    // All tensors of this block are named "g4v_blk<bid>_<step>".
    const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
    auto cbx = [&](ggml_tensor * t, const char * step) {
        ggml_set_name(t, (prefix + step).c_str());
    };

    // 1. Top-level LN
    // NOTE(review): this norm uses the builder's `eps`, not `qformer_eps` -
    // kept as in the original; confirm it is intentional.
    cbx(h, "inp");
    ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid);
    cbx(x, "norm");

    // 2. enc = _win(x, image_side, window_side)
    //    Rows are reordered through the "win_idx" input, then viewed per window.
    ggml_tensor * enc;
    {
        ggml_tensor * enc_flat = gather(x, prefix + "win_idx", image_side * image_side);
        enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows);
    }
    cbx(enc, "enc");

    // 3. downsampled = downsampler(x)
    ggml_tensor * d;
    if (spatial_offset >= 0) {
        d = gather(x, prefix + "spatial_idx", new_side * new_side);
    } else {
        d = interp_down(x, image_side, new_side);
    }
    cbx(d, "downsampled");

    // 4. query_embeds = query + _win(d, new_side, query_side)
    ggml_tensor * q_in;
    {
        ggml_tensor * dw_flat = gather(d, prefix + "qwin_idx", new_side * new_side);
        ggml_tensor * dw      = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows);
        q_in = ggml_add(ctx0, dw, blk.qf_proj_query);
    }
    cbx(q_in, "query_embeds");

    // 5. encoder_embeds = enc + image_positions -> (C, enc_len, n_windows)
    ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos);
    cbx(e_in, "encoder_embeds");

    // 6. QFormer forward: normalize the queries, then run the single layer.
    ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid);

    // Linear projection over all windows at once: dims 1 and 2 of the input are
    // flattened into a single token axis before the matmul; the bias is optional.
    auto linear = [&](ggml_tensor * in, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * {
        ggml_tensor * t = ggml_reshape_2d(ctx0, in, in->ne[0], in->ne[1] * in->ne[2]);
        t = build_mm(w, t);
        if (b) {
            t = ggml_add(ctx0, t, b);
        }
        return t;
    };

    // Only a single QFormer layer per block is supported.
    GGML_ASSERT(blk.qf_proj_layers.size() == 1);
    const auto & pl = blk.qf_proj_layers[0];

    // 6a. Self-attention over the queries, then residual + norm
    ggml_tensor * sa_out;
    {
        const int nq = q->ne[1];

        ggml_tensor * Q = linear(q, pl.q_w, pl.q_b);
        ggml_tensor * K = linear(q, pl.k_w, pl.k_b);
        ggml_tensor * V = linear(q, pl.v_w, pl.v_b);

        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows);

        sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid);
        sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows);

        sa_out = ggml_add(ctx0, sa_out, q);
        sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b,
                            NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(sa_out, "sa_out");

    // 6b. Cross-attention: queries attend to the encoder tokens of their window,
    //     then residual + norm
    ggml_tensor * ca_out;
    {
        const int nq  = sa_out->ne[1];
        const int nkv = e_in->ne[1];

        ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b);
        ggml_tensor * K = linear(e_in,   pl.cross_attn_k_w, pl.cross_attn_k_b);
        ggml_tensor * V = linear(e_in,   pl.cross_attn_v_w, pl.cross_attn_v_b);

        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq,  n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows);

        ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
                            Q, K, V, nullptr, scale, bid);
        ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows);

        ca_out = ggml_add(ctx0, ca_out, sa_out);
        ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b,
                            NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ca_out, "ca_out");

    // 6c. FFN (up -> GELU(erf) -> down), then residual + norm
    ggml_tensor * ffn;
    {
        ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows);
        t = build_mm(pl.ff_up_w, t);
        if (pl.ff_up_b) {
            t = ggml_add(ctx0, t, pl.ff_up_b);
        }
        t = ggml_gelu_erf(ctx0, t);
        t = build_mm(pl.ff_down_w, t);
        if (pl.ff_down_b) {
            t = ggml_add(ctx0, t, pl.ff_down_b);
        }
        t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows);
        ffn = ggml_add(ctx0, t, ca_out);
        ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ffn, "qformer_out");

    // 7. _unwin back to raster: flatten the windows and reorder the rows through
    //    the "unwin_idx" input.
    ggml_tensor * unwinned;
    {
        ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows);
        unwinned = gather(flat, prefix + "unwin_idx", new_side * new_side);
    }
    cbx(unwinned, "unwin");

    // 8. out_linear (bias is optional)
    ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned);
    if (blk.qf_proj_linear_b) {
        out = ggml_add(ctx0, out, blk.qf_proj_linear_b);
    }
    cbx(out, "out");

    return out;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// build() - top-level graph
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Build the K-tiled, base-scaled newline row tensor.
|
|
// Shape: (n_mmproj_embd, 1)
|
|
ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) {
|
|
const int K = (int) model.qf_proj_blocks.size();
|
|
GGML_ASSERT(K > 0);
|
|
GGML_ASSERT(n_mmproj_embd % K == 0);
|
|
const int projection_dim = n_mmproj_embd / K;
|
|
GGML_ASSERT(model.image_newline != nullptr);
|
|
GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim);
|
|
|
|
// Build newline_row[k*projection_dim + d] = nl[d] * (k == 0 ? base : 1.0)
|
|
ggml_tensor * nl = model.image_newline; // (projection_dim,)
|
|
ggml_tensor * nl_first_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1);
|
|
ggml_tensor * nl_row_2d;
|
|
if (K == 1) {
|
|
nl_row_2d = nl_first_2d;
|
|
} else {
|
|
ggml_tensor * nl_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1);
|
|
ggml_tensor * rest_template = ggml_new_tensor_2d(
|
|
ctx0, GGML_TYPE_F32, projection_dim, K - 1);
|
|
ggml_tensor * nl_rest = ggml_repeat(ctx0, nl_2d, rest_template);
|
|
nl_row_2d = ggml_concat(ctx0, nl_first_2d, nl_rest, 1); // (projection_dim, K)
|
|
}
|
|
nl_row_2d = ggml_cont(ctx0, nl_row_2d);
|
|
return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1);
|
|
}
|
|
|
|
// Append a single newline row at the end of the tile output.
|
|
ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) {
|
|
// For the single-tile case, append one newline row at the end.
|
|
// For the multi-tile rowwise case, this will be called per-tile
|
|
// (though currently only the single-tile path uses it).
|
|
ggml_tensor * nl_row = build_newline_row(ctx0);
|
|
return ggml_concat(ctx0, tile_output, nl_row, 1);
|
|
}
|
|
|
|
// Build the full Granite Vision clip graph and return it.
//
// Stage 1a    : run the vision encoder layers, keeping every layer's output.
// Stage 1b/1c : for each entry of hparams.feature_layers, feed the selected
//               layer output through the matching WindowQFormer block and
//               concatenate the block outputs along dim 0.
// Stage 1d    : when add_newline is set, append one newline row along dim 1.
//
// The final tensor is named "g4v_mmproj_out_nl" or "g4v_mmproj_out" and is
// expanded into the graph `gf`.
ggml_cgraph * clip_graph_granite4_vision::build() {
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(!model.qf_proj_blocks.empty());

    // --- Stage 1a: SigLIP encoder producing intermediate hidden states ---
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, model.position_embeddings);
    cb(inp, "pos_embed", -1);

    ggml_tensor * inpL = inp;
    std::vector<ggml_tensor *> layer_outs(n_layer, nullptr);

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];
        ggml_tensor * cur = inpL;

        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);

        // Self-attention (biases are optional)
        ggml_tensor * Qcur = build_mm(layer.q_w, cur);
        if (layer.q_b) {
            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
        }
        ggml_tensor * Kcur = build_mm(layer.k_w, cur);
        if (layer.k_b) {
            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
        }
        ggml_tensor * Vcur = build_mm(layer.v_w, cur);
        if (layer.v_b) {
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
        }

        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);

        cur = build_attn(layer.o_w, layer.o_b,
                         Qcur, Kcur, Vcur, nullptr, kq_scale, il);

        // First residual: attention output + layer input
        cur  = ggml_add(ctx0, cur, inpL);
        inpL = cur;

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cur = build_ffn(cur,
                        layer.ff_up_w,   layer.ff_up_b,
                        layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);

        // Second residual: FFN output + post-attention state
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        // Keep every layer's output so any of them can feed a projector block.
        layer_outs[il] = cur;
        inpL = cur;
    }

    // --- Stage 1b/1c: WindowQFormer blocks ---
    const int   projector_count = (int) hparams.feature_layers.size();
    const float qformer_eps     = 1e-12f;

    // The loop is bounded by feature_layers but also indexes qf_proj_blocks, so
    // make sure every feature layer has a block; an empty list would also leave
    // `mmproj` null below.
    // NOTE(review): hparams.proj_spatial_offsets is indexed by the same `bid` and
    // is assumed to hold at least projector_count entries - confirm at load time.
    GGML_ASSERT(projector_count > 0);
    GGML_ASSERT((size_t) projector_count <= model.qf_proj_blocks.size());

    ggml_tensor * mmproj = nullptr;
    for (int bid = 0; bid < projector_count; ++bid) {
        const auto & blk = model.qf_proj_blocks[bid];

        const int vlayer = hparams.feature_layers[bid];
        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
        ggml_tensor * h = layer_outs[vlayer];

        ggml_tensor * stream = build_block(
            blk, h, bid,
            hparams.proj_spatial_offsets[bid],
            n_patches_x,
            hparams.downsample_window_side,
            hparams.downsample_query_side,
            qformer_eps);

        const std::string stream_name = "proj_" + std::to_string(bid) + "_v_out";
        cb(stream, stream_name.c_str(), vlayer);

        // Stack the block outputs along the feature dimension (dim 0).
        mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream;
    }

    // --- Stage 1d: Append newline tokens if add_newline is set ---
    if (add_newline) {
        mmproj = append_rowwise_newlines(ctx0, mmproj);
        ggml_set_name(mmproj, "g4v_mmproj_out_nl");
    } else {
        ggml_set_name(mmproj, "g4v_mmproj_out");
    }
    ggml_build_forward_expand(gf, mmproj);

    return gf;
}
|