use flops instead of matmul src0 tensor size for submission threshold

This commit is contained in:
Ruben Ortlam
2026-06-24 15:24:56 +02:00
parent 492adff8fb
commit cb2a4259aa
+16 -15
View File
@@ -2099,7 +2099,7 @@ struct ggml_backend_vk_context {
bool do_add_rms_partials_offset_calculation;
bool do_add_rms_partials;
uint64_t last_total_mul_mat_bytes {};
uint64_t last_total_flops {UINT64_MAX};
// Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
@@ -16198,22 +16198,23 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
// Estimate the amount of compute work using flops, and submit every 200 GFLOP
// (and scaled down based on total graph flops, so smaller models submit earlier).
// Also submit at least every 100 nodes, in case there are workloads without heavy compute.
int submitted_nodes = 0;
int submit_count = 0;
uint64_t mul_mat_bytes = 0;
uint64_t total_mul_mat_bytes = 0;
uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u);
uint64_t batch_flops = 0;
uint64_t total_flops = 0;
uint64_t flops_per_submit = std::min(uint64_t(200'000'000'000), ctx->last_total_flops / 40u);
for (int i = 0; i < cgraph->n_nodes; i++) {
if (first_node_in_batch) {
submit_node_idx = i;
}
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]);
mul_mat_bytes += bytes;
total_mul_mat_bytes += bytes;
{
auto node_flops = ggml_vk_get_node_flops(cgraph->nodes[i]);
batch_flops += node_flops;
total_flops += node_flops;
}
// op_srcs_fused_elementwise indicates whether an op's srcs all contribute to
@@ -16425,8 +16426,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) ||
(mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
bool submit = (submitted_nodes >= ctx->device->max_nodes_per_submit) ||
(flops_per_submit != 0 && batch_flops >= flops_per_submit) ||
(i + ctx->num_additional_fused_ops >= last_node) ||
(almost_ready && !ctx->almost_ready_fence_pending);
@@ -16460,9 +16461,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (submit && enqueued) {
first_node_in_batch = true;
submitted_nodes = 0;
mul_mat_bytes = 0;
batch_flops = 0;
if (submit_count < 3) {
mul_mat_bytes_per_submit *= 2;
flops_per_submit *= 2;
}
submit_count++;
}
@@ -16471,7 +16472,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->fused_ops_write_mask = 0;
}
ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
ctx->last_total_flops = total_flops;
if (vk_perf_logger_enabled) {
// End the command buffer and submit/wait