From aa27b85ecf00de2ca6f4ea87e05bafbd18d5fb43 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 19 May 2026 18:40:49 +0300
Subject: [PATCH] metal : optimize pad

---
 ggml/src/ggml-metal/ggml-metal-ops.cpp |  6 ++++--
 ggml/src/ggml-metal/ggml-metal.metal   | 10 ++++++----
 src/models/delta-net-base.cpp          |  9 +++++----
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index a114391c2e..c388bd6036 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -4039,14 +4039,16 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
 
     auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
 
-    const int nth = std::min(1024, ne0);
+    const int nth_max = MIN(1024, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    const int nth = MIN(args.ne0, nth_max);
+    const int nk0 = (args.ne0 + nth - 1)/nth;
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+    ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1);
 
     return 1;
 }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index f6ffb2b3a1..c678cc6e63 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -5336,10 +5336,10 @@ kernel void kernel_pad_f32(
     uint3 tgpig[[threadgroup_position_in_grid]],
     uint3 tpitg[[thread_position_in_threadgroup]],
     uint3   ntg[[threads_per_threadgroup]]) {
-
     const int64_t i3 = tgpig.z;
     const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
+    const int64_t k0 = tgpig.x/args.ne1;
+    const int64_t i1 = tgpig.x - k0*args.ne1;
 
     const int64_t i03 = i3;
     const int64_t i02 = i2;
@@ -5348,8 +5348,10 @@ kernel void kernel_pad_f32(
     device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
     device       float * dst_ptr  = (device       float *) (dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1);
 
+    const int64_t i0 = k0*ntg.x + tpitg.x;
+
     if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
-        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        if (i0 < args.ne0) {
             if (i0 < args.ne00) {
                 dst_ptr[i0] = src0_ptr[i0];
             } else {
@@ -5360,7 +5362,7 @@ kernel void kernel_pad_f32(
         return;
     }
 
-    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+    if (i0 < args.ne0) {
         dst_ptr[i0] = 0.0f;
     }
 }
diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
index a67238383e..1d3428fbe2 100644
--- a/src/models/delta-net-base.cpp
+++ b/src/models/delta-net-base.cpp
@@ -562,13 +562,14 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
     }
 
     const int64_t D = S_v * S_v * H_v;
-    const int64_t K = (int64_t) cparams.n_rs_seq + 1;
+    const int64_t K = cparams.n_rs_seq + 1;
 
     // TODO: remove pad + simplify
-    ggml_tensor * state_in_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
-    ggml_tensor * state_3d    = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0);
+    ggml_tensor * s_4d     = ggml_reshape_4d(ctx0, s, S_v, S_v*H_v, 1, n_seqs);
+    ggml_tensor * s_4d_pad = ggml_pad       (ctx0, s_4d, 0, 0, K - 1, 0);
+    ggml_tensor * s_3d     = ggml_reshape_3d(ctx0, s_4d_pad, D, K, n_seqs);
 
-    ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d);
+    ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d);
     if (n_seq_tokens > 1) {
         cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
     } else {