#include "models.h"

#include "../clip-impl.h"
#include "../clip-model.h"

#include <cmath>
#include <string>
#include <vector>

/*
 * Granite Vision 4.1 clip graph
 *
 * Stage 1a: SigLIP vision tower (N layers; LayerNorm is applied before the
 *           attention and FFN sub-blocks, every layer output is retained)
 * Stage 1b: WindowQFormer blocks (deepstack + spatial)
 * Stage 1c: Concatenate and pack outputs
 * Stage 1d: Append newline tokens if add_newline is set
 */

// ---------------------------------------------------------------------------
// Member method implementations
// ---------------------------------------------------------------------------

// Row-gather `src` through a freshly created index tensor.
//
// Creates a 1-D I32 tensor of `idx_len` elements named `name`, flags it as a
// graph input (so the index values are supplied from outside the graph), and
// returns ggml_get_rows(src, idx).
ggml_tensor * clip_graph_granite4_vision::gather(
        ggml_tensor * src,
        const std::string & name,
        int idx_len) {
    ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len);
    ggml_set_name(idx, name.c_str());
    ggml_set_input(idx);
    return ggml_get_rows(ctx0, src, idx);
}

// Downsample a (n_embd, side*side) token grid to (n_embd, new_side*new_side)
// with non-overlapping average pooling.
//
// `side` must be a positive multiple of `new_side`; the pooling kernel and
// stride are both side / new_side.
ggml_tensor * clip_graph_granite4_vision::interp_down(
        ggml_tensor * src,
        int side,
        int new_side) {
    // Guard the division below and the final reshape: a non-integer ratio
    // would make the pooled grid differ from new_side x new_side.
    GGML_ASSERT(new_side > 0);
    GGML_ASSERT(side % new_side == 0);

    const int n_embd = src->ne[0];
    const int kernel = side / new_side;

    // (n_embd, side, side, 1) -> (side, side, n_embd, 1) so that pooling runs
    // over the two spatial axes.
    ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1);
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3));

    t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0);

    // Back to channels-first and flatten the spatial grid.
    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3));
    return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side);
}

// ---------------------------------------------------------------------------
// build_block - WindowQFormer block implementation
// ---------------------------------------------------------------------------

// Build one WindowQFormer projector block on top of the hidden state `h`.
//
//   blk            weights of this projector block (exactly one QFormer layer)
//   h              input hidden state, (n_embd, image_side * image_side)
//   bid            block index; used in tensor names and passed as the layer id
//   spatial_offset >= 0 selects the index-gather downsampler, a negative value
//                  selects average pooling (interp_down)
//   image_side     side length of the input token grid
//   window_side    side length of an encoder window
//   query_side     side length of the query grid per window
//   qformer_eps    LayerNorm epsilon used inside the QFormer
//
// Returns the output of the block's final linear projection, with
// (image_side / window_side * query_side)^2 tokens.
ggml_tensor * clip_graph_granite4_vision::build_block(
        const qf_block & blk,
        ggml_tensor * h,
        int bid,
        int spatial_offset,
        int image_side,
        int window_side,
        int query_side,
        float qformer_eps) {
    constexpr int d_h = 64; // attention head size used by the QFormer

    const int n_embd = h->ne[0];

    GGML_ASSERT(window_side > 0 && query_side > 0);
    GGML_ASSERT(image_side % window_side == 0);
    GGML_ASSERT(h->ne[1] == image_side * image_side);
    GGML_ASSERT(n_embd % d_h == 0);

    const int n         = image_side / window_side; // windows per side
    const int new_side  = n * query_side;
    const int n_windows = n * n;
    const int enc_len   = window_side * window_side;
    const int query_len = query_side * query_side;

    const int   n_head     = n_embd / d_h;
    const float attn_scale = 1.0f / std::sqrt(static_cast<float>(d_h));

    const std::string prefix = "g4v_blk" + std::to_string(bid);

    // Name a tensor "g4v_blk<bid>_<step>" for debugging.
    auto cbx = [&](ggml_tensor * t, const char * step) {
        const std::string name = prefix + "_" + step;
        ggml_set_name(t, name.c_str());
    };

    // 1. Top-level LN
    // NOTE(review): this norm uses the member `eps`, not `qformer_eps` -
    // presumably intentional (it acts on vision-tower features); confirm
    // against the reference implementation.
    cbx(h, "inp");
    ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid);
    cbx(x, "norm");

    // 2. enc = _win(x, image_side, window_side)
    ggml_tensor * enc_flat = gather(x, prefix + "_win_idx", image_side * image_side);
    ggml_tensor * enc      = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows);
    cbx(enc, "enc");

    // 3. downsampled = downsampler(x)
    ggml_tensor * d = spatial_offset >= 0
        ? gather(x, prefix + "_spatial_idx", new_side * new_side)
        : interp_down(x, image_side, new_side);
    cbx(d, "downsampled");

    // 4. query_embeds = query + _win(d, new_side, query_side)
    ggml_tensor * dw_flat = gather(d, prefix + "_qwin_idx", new_side * new_side);
    ggml_tensor * dw      = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows);
    ggml_tensor * q_in    = ggml_add(ctx0, dw, blk.qf_proj_query);
    cbx(q_in, "query_embeds");

    // 5. encoder_embeds = enc + image_positions -> (C, enc_len, n_windows)
    ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos);
    cbx(e_in, "encoder_embeds");

    // 6. Qformer forward.
    ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b,
                                 NORM_TYPE_NORMAL, qformer_eps, bid);

    // Linear projection with the window axis folded into the token axis.
    auto linear = [&](ggml_tensor * in, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * {
        ggml_tensor * t = ggml_reshape_2d(ctx0, in, in->ne[0], in->ne[1] * in->ne[2]);
        t = build_mm(w, t);
        if (b) {
            t = ggml_add(ctx0, t, b);
        }
        return t;
    };

    // Windowed multi-head attention followed by residual add and LayerNorm.
    // Queries come from `q_src`, keys/values from `kv_src` (the same tensor
    // for self-attention); the residual is taken from `q_src`.
    auto attention = [&](ggml_tensor * q_src, ggml_tensor * kv_src,
                         ggml_tensor * q_w,  ggml_tensor * q_b,
                         ggml_tensor * k_w,  ggml_tensor * k_b,
                         ggml_tensor * v_w,  ggml_tensor * v_b,
                         ggml_tensor * o_w,  ggml_tensor * o_b,
                         ggml_tensor * ln_w, ggml_tensor * ln_b) -> ggml_tensor * {
        const int nq  = q_src->ne[1];
        const int nkv = kv_src->ne[1];

        ggml_tensor * Q = linear(q_src,  q_w, q_b);
        ggml_tensor * K = linear(kv_src, k_w, k_b);
        ggml_tensor * V = linear(kv_src, v_w, v_b);

        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq,  n_windows);
        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows);
        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows);

        ggml_tensor * out = build_attn(o_w, o_b, Q, K, V, nullptr, attn_scale, bid);
        out = ggml_reshape_3d(ctx0, out, n_embd, nq, n_windows);
        out = ggml_add(ctx0, out, q_src);
        return build_norm(out, ln_w, ln_b, NORM_TYPE_NORMAL, qformer_eps, bid);
    };

    // Get the single QFormer layer
    GGML_ASSERT(blk.qf_proj_layers.size() == 1);
    const auto & pl = blk.qf_proj_layers[0];

    // 6a. Self-attention
    ggml_tensor * sa_out = attention(q, q,
                                     pl.q_w, pl.q_b,
                                     pl.k_w, pl.k_b,
                                     pl.v_w, pl.v_b,
                                     pl.o_w, pl.o_b,
                                     pl.ln_1_w, pl.ln_1_b);
    cbx(sa_out, "sa_out");

    // 6b. Cross-attention
    ggml_tensor * ca_out = attention(sa_out, e_in,
                                     pl.cross_attn_q_w, pl.cross_attn_q_b,
                                     pl.cross_attn_k_w, pl.cross_attn_k_b,
                                     pl.cross_attn_v_w, pl.cross_attn_v_b,
                                     pl.cross_attn_o_w, pl.cross_attn_o_b,
                                     pl.cross_attn_norm_w, pl.cross_attn_norm_b);
    cbx(ca_out, "ca_out");

    // 6c. FFN
    ggml_tensor * ffn;
    {
        ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows);
        t = build_mm(pl.ff_up_w, t);
        if (pl.ff_up_b) {
            t = ggml_add(ctx0, t, pl.ff_up_b);
        }
        t = ggml_gelu_erf(ctx0, t);
        t = build_mm(pl.ff_down_w, t);
        if (pl.ff_down_b) {
            t = ggml_add(ctx0, t, pl.ff_down_b);
        }
        t   = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows);
        ffn = ggml_add(ctx0, t, ca_out);
        ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid);
    }
    cbx(ffn, "qformer_out");

    // 7. _unwin back to raster
    ggml_tensor * flat     = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows);
    ggml_tensor * unwinned = gather(flat, prefix + "_unwin_idx", new_side * new_side);
    cbx(unwinned, "unwin");

    // 8. out_linear
    ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned);
    if (blk.qf_proj_linear_b) {
        out = ggml_add(ctx0, out, blk.qf_proj_linear_b);
    }
    cbx(out, "out");

    return out;
}

// ---------------------------------------------------------------------------
// build() - top-level graph
// ---------------------------------------------------------------------------

// Build the K-tiled newline row tensor, K being the number of projector
// blocks.
// Shape: (n_mmproj_embd, 1)
//
// newline_row[k*projection_dim + d] = nl[d] for every k in [0, K); no
// per-segment scaling is applied.
ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) {
    const int K = (int) model.qf_proj_blocks.size();
    GGML_ASSERT(K > 0);
    GGML_ASSERT(n_mmproj_embd % K == 0);
    const int projection_dim = n_mmproj_embd / K;

    GGML_ASSERT(model.image_newline != nullptr);
    GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim);

    ggml_tensor * nl_row_2d = ggml_reshape_2d(ctx0, model.image_newline, projection_dim, 1);
    if (K > 1) {
        // Tile the single column K times; the template tensor only provides
        // the target shape (projection_dim, K) for ggml_repeat.
        ggml_tensor * tiled_shape = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, projection_dim, K);
        nl_row_2d = ggml_repeat(ctx0, nl_row_2d, tiled_shape);
    }

    nl_row_2d = ggml_cont(ctx0, nl_row_2d);
    return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1);
}

// Append a single newline row at the end of the tile output.
ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0,
                                                                  ggml_tensor * tile_output) {
    // For the single-tile case, append one newline row at the end.
    // For the multi-tile rowwise case, this will be called per-tile
    // (though currently only the single-tile path uses it).
    ggml_tensor * nl_row = build_newline_row(ctx0);
    return ggml_concat(ctx0, tile_output, nl_row, 1);
}

// Assemble the full graph: vision tower, one WindowQFormer block per selected
// feature layer (outputs concatenated along the embedding axis), and an
// optional trailing newline row. Returns the graph `gf`.
ggml_cgraph * clip_graph_granite4_vision::build() {
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(!model.qf_proj_blocks.empty());

    // --- Stage 1a: SigLIP encoder producing intermediate hidden states ---
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, model.position_embeddings);
    cb(inp, "pos_embed", -1);

    ggml_tensor * inpL = inp;
    std::vector<ggml_tensor *> layer_outs(n_layer, nullptr);

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];

        ggml_tensor * cur = build_norm(inpL, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);

        // Self-attention
        ggml_tensor * Qcur = build_mm(layer.q_w, cur);
        if (layer.q_b) {
            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
        }
        ggml_tensor * Kcur = build_mm(layer.k_w, cur);
        if (layer.k_b) {
            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
        }
        ggml_tensor * Vcur = build_mm(layer.v_w, cur);
        if (layer.v_b) {
            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
        }

        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);

        cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il);

        // First residual connection.
        cur  = ggml_add(ctx0, cur, inpL);
        inpL = cur;

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cur = build_ffn(cur,
                        layer.ff_up_w,   layer.ff_up_b,
                        layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);

        // Second residual connection.
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        layer_outs[il] = cur;
        inpL           = cur;
    }

    // --- Stage 1b/1c: WindowQFormer blocks ---
    const int   projector_count = static_cast<int>(hparams.feature_layers.size());
    const float qformer_eps     = 1e-12f;

    // Without at least one feature layer there is no output tensor, and every
    // feature layer needs a projector block to index.
    // NOTE(review): hparams.proj_spatial_offsets is indexed by the same `bid`
    // and is not bounds-checked here - confirm it is sized accordingly.
    GGML_ASSERT(projector_count > 0);
    GGML_ASSERT(model.qf_proj_blocks.size() >= static_cast<size_t>(projector_count));

    ggml_tensor * mmproj = nullptr;
    for (int bid = 0; bid < projector_count; ++bid) {
        const auto & blk    = model.qf_proj_blocks[bid];
        const int    vlayer = hparams.feature_layers[bid];
        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);

        ggml_tensor * stream = build_block(
            blk, layer_outs[vlayer], bid,
            hparams.proj_spatial_offsets[bid],
            n_patches_x,
            hparams.downsample_window_side,
            hparams.downsample_query_side,
            qformer_eps);

        const std::string stream_name = "proj_" + std::to_string(bid) + "_v_out";
        cb(stream, stream_name.c_str(), vlayer);

        // Pack the per-block outputs along the embedding axis.
        mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream;
    }

    // --- Stage 1d: Append newline tokens if add_newline is set ---
    if (add_newline) {
        mmproj = append_rowwise_newlines(ctx0, mmproj);
        ggml_set_name(mmproj, "g4v_mmproj_out_nl");
    } else {
        ggml_set_name(mmproj, "g4v_mmproj_out");
    }

    ggml_build_forward_expand(gf, mmproj);
    return gf;
}