diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 38ff11d762..fa3ef0e9cf 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2099,7 +2099,7 @@ struct ggml_backend_vk_context { bool do_add_rms_partials_offset_calculation; bool do_add_rms_partials; - uint64_t last_total_mul_mat_bytes {}; + uint64_t last_total_flops {UINT64_MAX}; // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. vk_pipeline_struct * prealloc_y_last_pipeline_used {}; @@ -16198,22 +16198,23 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. - // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB - // (and scaled down based on model size, so smaller models submit earlier). + // Estimate the amount of compute work using flops, and submit every 200 GFLOP + // (and scaled down based on total graph flops, so smaller models submit earlier). + // Also submit at least every 100 nodes, in case there are workloads without heavy compute. int submitted_nodes = 0; int submit_count = 0; - uint64_t mul_mat_bytes = 0; - uint64_t total_mul_mat_bytes = 0; - uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u); + uint64_t batch_flops = 0; + uint64_t total_flops = 0; + uint64_t flops_per_submit = std::min(uint64_t(200'000'000'000), ctx->last_total_flops / 40u); for (int i = 0; i < cgraph->n_nodes; i++) { if (first_node_in_batch) { submit_node_idx = i; } - if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { - auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]); - mul_mat_bytes += bytes; - total_mul_mat_bytes += bytes; + { + auto node_flops = ggml_vk_get_node_flops(cgraph->nodes[i]); + batch_flops += node_flops; + total_flops += node_flops; } // op_srcs_fused_elementwise indicates whether an op's srcs all contribute to @@ -16425,8 +16426,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; - bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) || - (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) || + bool submit = (submitted_nodes >= ctx->device->max_nodes_per_submit) || + (flops_per_submit != 0 && batch_flops >= flops_per_submit) || (i + ctx->num_additional_fused_ops >= last_node) || (almost_ready && !ctx->almost_ready_fence_pending); @@ -16460,9 +16461,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (submit && enqueued) { first_node_in_batch = true; submitted_nodes = 0; - mul_mat_bytes = 0; + batch_flops = 0; if (submit_count < 3) { - mul_mat_bytes_per_submit *= 2; + flops_per_submit *= 2; } submit_count++; } @@ -16471,7 +16472,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->fused_ops_write_mask = 0; } - ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; + ctx->last_total_flops = total_flops; if (vk_perf_logger_enabled) { // End the command buffer and submit/wait