mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 06:10:19 +00:00
use flops instead of matmul src0 tensor size for submission threshold
This commit is contained in:
@@ -2099,7 +2099,7 @@ struct ggml_backend_vk_context {
|
||||
bool do_add_rms_partials_offset_calculation;
|
||||
bool do_add_rms_partials;
|
||||
|
||||
uint64_t last_total_mul_mat_bytes {};
|
||||
uint64_t last_total_flops {UINT64_MAX};
|
||||
|
||||
// Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
|
||||
vk_pipeline_struct * prealloc_y_last_pipeline_used {};
|
||||
@@ -16198,22 +16198,23 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
}
|
||||
|
||||
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
|
||||
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
|
||||
// (and scaled down based on model size, so smaller models submit earlier).
|
||||
// Estimate the amount of compute work using flops, and submit every 200 GFLOP
|
||||
// (and scaled down based on total graph flops, so smaller models submit earlier).
|
||||
// Also submit at least every 100 nodes, in case there are workloads without heavy compute.
|
||||
int submitted_nodes = 0;
|
||||
int submit_count = 0;
|
||||
uint64_t mul_mat_bytes = 0;
|
||||
uint64_t total_mul_mat_bytes = 0;
|
||||
uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u);
|
||||
uint64_t batch_flops = 0;
|
||||
uint64_t total_flops = 0;
|
||||
uint64_t flops_per_submit = std::min(uint64_t(200'000'000'000), ctx->last_total_flops / 40u);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (first_node_in_batch) {
|
||||
submit_node_idx = i;
|
||||
}
|
||||
|
||||
if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
|
||||
auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]);
|
||||
mul_mat_bytes += bytes;
|
||||
total_mul_mat_bytes += bytes;
|
||||
{
|
||||
auto node_flops = ggml_vk_get_node_flops(cgraph->nodes[i]);
|
||||
batch_flops += node_flops;
|
||||
total_flops += node_flops;
|
||||
}
|
||||
|
||||
// op_srcs_fused_elementwise indicates whether an op's srcs all contribute to
|
||||
@@ -16425,8 +16426,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
|
||||
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
|
||||
bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) ||
|
||||
(mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
|
||||
bool submit = (submitted_nodes >= ctx->device->max_nodes_per_submit) ||
|
||||
(flops_per_submit != 0 && batch_flops >= flops_per_submit) ||
|
||||
(i + ctx->num_additional_fused_ops >= last_node) ||
|
||||
(almost_ready && !ctx->almost_ready_fence_pending);
|
||||
|
||||
@@ -16460,9 +16461,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
if (submit && enqueued) {
|
||||
first_node_in_batch = true;
|
||||
submitted_nodes = 0;
|
||||
mul_mat_bytes = 0;
|
||||
batch_flops = 0;
|
||||
if (submit_count < 3) {
|
||||
mul_mat_bytes_per_submit *= 2;
|
||||
flops_per_submit *= 2;
|
||||
}
|
||||
submit_count++;
|
||||
}
|
||||
@@ -16471,7 +16472,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->fused_ops_write_mask = 0;
|
||||
}
|
||||
|
||||
ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
|
||||
ctx->last_total_flops = total_flops;
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
// End the command buffer and submit/wait
|
||||
|
||||
Reference in New Issue
Block a user