mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 06:10:19 +00:00
improve test
This commit is contained in:
+113
-31
@@ -24,6 +24,7 @@ struct bench_params {
|
||||
bool do_d2d = true;
|
||||
bool do_async = true;
|
||||
bool csv = false;
|
||||
int async_copies = 4;
|
||||
};
|
||||
|
||||
struct device_info {
|
||||
@@ -75,6 +76,7 @@ static void usage(const char * argv0) {
|
||||
fprintf(stderr, " --no-d2h skip device-to-host tests\n");
|
||||
fprintf(stderr, " --no-d2d skip device-to-device tests\n");
|
||||
fprintf(stderr, " --no-async skip async copy tests\n");
|
||||
fprintf(stderr, " --async-copies N number of pipelined async copies (default: 4)\n");
|
||||
fprintf(stderr, " --output FORMAT output format: table (default), csv\n");
|
||||
}
|
||||
|
||||
@@ -110,6 +112,10 @@ static bool parse_args(int argc, char ** argv, bench_params & params) {
|
||||
params.do_d2d = false;
|
||||
} else if (strcmp(argv[i], "--no-async") == 0) {
|
||||
params.do_async = false;
|
||||
} else if (strcmp(argv[i], "--async-copies") == 0) {
|
||||
if (++i >= argc) { usage(argv[0]); return false; }
|
||||
params.async_copies = atoi(argv[i]);
|
||||
if (params.async_copies < 1) { fprintf(stderr, "invalid async-copies: %s\n", argv[i]); return false; }
|
||||
} else if (strcmp(argv[i], "--output") == 0) {
|
||||
if (++i >= argc) { usage(argv[0]); return false; }
|
||||
if (strcmp(argv[i], "csv") == 0) {
|
||||
@@ -227,6 +233,35 @@ struct tensor_on_device {
|
||||
ggml_tensor * tensor;
|
||||
};
|
||||
|
||||
// Creates up to `count` 1-D F32 tensors of (at least) `size_bytes` bytes each and places
// them one after another in `buffer` using a single linear tensor allocator.
//
//   buffer     - backend buffer the tensors are allocated from
//   size_bytes - requested payload size per tensor; rounded down to a whole number of
//                floats, with a minimum of one element
//   count      - number of tensors to create; values <= 0 yield an empty result
//   prefix     - tensor name prefix; tensors are named "<prefix>_<index>"
//
// Returns the tensors that were created successfully, each paired with the ggml context
// that owns its metadata. The result may hold fewer than `count` entries if a context,
// tensor, or buffer allocation fails, so callers must compare result.size() to `count`.
static std::vector<tensor_on_device> create_tensors_on_buffer(ggml_backend_buffer_t buffer, size_t size_bytes, int count, const char * prefix) {
    std::vector<tensor_on_device> result;
    if (count <= 0) {
        return result;
    }
    result.reserve((size_t) count);

    // The element count is the same for every tensor, so compute it once.
    int64_t n_elements = (int64_t)(size_bytes / sizeof(float));
    if (n_elements == 0) {
        n_elements = 1;
    }

    // One allocator for the whole batch so that consecutive tensors get distinct offsets.
    struct ggml_tallocr talloc = ggml_tallocr_new(buffer);

    for (int i = 0; i < count; i++) {
        // Each tensor gets its own small metadata-only context (no_alloc = true); the
        // tensor data itself lives in `buffer`.
        ggml_init_params init_params = {
            /* .mem_size   = */ ggml_tensor_overhead() + 64,
            /* .mem_buffer = */ nullptr,
            /* .no_alloc   = */ true,
        };
        ggml_context_ptr ctx(ggml_init(init_params));
        if (!ctx) { break; }

        ggml_tensor * tensor = ggml_new_tensor_1d(ctx.get(), GGML_TYPE_F32, n_elements);
        if (!tensor) { break; }

        char tname[128];
        snprintf(tname, sizeof(tname), "%s_%d", prefix, i);
        ggml_set_name(tensor, tname);

        // Do not hand out a tensor whose backing memory was not set up; a short result
        // is the failure signal for the caller.
        // NOTE(review): ggml may abort internally when the buffer is too small instead of
        // returning an error status - confirm against the ggml-alloc version in use.
        if (ggml_tallocr_alloc(&talloc, tensor) != GGML_STATUS_SUCCESS) { break; }

        result.push_back({ std::move(ctx), tensor });
    }
    return result;
}
|
||||
|
||||
static tensor_on_device create_tensor_on_buffer(ggml_backend_buffer_t buffer, size_t size_bytes, const char * name) {
|
||||
int64_t n_elements = (int64_t)(size_bytes / sizeof(float));
|
||||
if (n_elements == 0) {
|
||||
@@ -329,10 +364,11 @@ int main(int argc, char ** argv) {
|
||||
printf("No GPU devices found. Only CPU-to-CPU results available.\n\n");
|
||||
}
|
||||
|
||||
// Allocate buffers on each device (sized for the largest test)
|
||||
// Allocate buffers on each device (sized for the largest test * async copies)
|
||||
size_t max_size = params.sizes.back();
|
||||
int n_buf_tensors = (params.do_async && params.do_d2d) ? params.async_copies : 1;
|
||||
for (auto & dev : devices) {
|
||||
size_t alloc_size = max_size + 4096;
|
||||
size_t alloc_size = (size_t)n_buf_tensors * max_size + 4096;
|
||||
if (dev.mem_total > 0 && alloc_size > dev.mem_free) {
|
||||
fprintf(stderr, "warning: max test size (%s) exceeds free memory on %s (%zu MB), will skip large sizes\n",
|
||||
format_size(max_size).c_str(), dev.name.c_str(), dev.mem_free / (1024 * 1024));
|
||||
@@ -392,17 +428,46 @@ int main(int argc, char ** argv) {
|
||||
return nullptr;
|
||||
};
|
||||
|
||||
// Correctness verification: set/get round-trip on each device
|
||||
std::vector<uint8_t> readback(size);
|
||||
for (auto & t : tensors) {
|
||||
ggml_backend_tensor_set(t.td.tensor, host_data.data(), 0, size);
|
||||
ggml_backend_tensor_get(t.td.tensor, readback.data(), 0, size);
|
||||
if (memcmp(host_data.data(), readback.data(), size) != 0) {
|
||||
fprintf(stderr, "FAIL: set/get round-trip on %s for size %s\n",
|
||||
devices[t.dev_idx].name.c_str(), format_size(size).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// Correctness verification: D2D copy
|
||||
if (params.do_d2d) {
|
||||
for (size_t si = 0; si < devices.size(); si++) {
|
||||
for (size_t di = 0; di < devices.size(); di++) {
|
||||
if (si == di) { continue; }
|
||||
ggml_tensor * src = find_tensor(si);
|
||||
ggml_tensor * dst = find_tensor(di);
|
||||
if (!src || !dst) { continue; }
|
||||
|
||||
ggml_backend_tensor_copy(src, dst);
|
||||
ggml_backend_tensor_get(dst, readback.data(), 0, size);
|
||||
if (memcmp(host_data.data(), readback.data(), size) != 0) {
|
||||
fprintf(stderr, "FAIL: D2D copy %s -> %s for size %s\n",
|
||||
devices[si].name.c_str(), devices[di].name.c_str(), format_size(size).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Host-to-Device
|
||||
if (params.do_h2d) {
|
||||
for (size_t gi : gpu_indices) {
|
||||
ggml_tensor * dst = find_tensor(gi);
|
||||
if (!dst) { continue; }
|
||||
ggml_backend_t be = devices[gi].backend.get();
|
||||
|
||||
auto r = benchmark_copy("Host", devices[gi].name.c_str(), "set", size, params,
|
||||
[&]{ ggml_backend_synchronize(be); },
|
||||
[&]{ },
|
||||
[&]{ ggml_backend_tensor_set(dst, host_data.data(), 0, size); },
|
||||
[&]{ ggml_backend_synchronize(be); });
|
||||
[&]{ });
|
||||
|
||||
params.csv ? print_csv_row(r) : print_table_row(r);
|
||||
}
|
||||
@@ -413,12 +478,11 @@ int main(int argc, char ** argv) {
|
||||
for (size_t gi : gpu_indices) {
|
||||
ggml_tensor * src = find_tensor(gi);
|
||||
if (!src) { continue; }
|
||||
ggml_backend_t be = devices[gi].backend.get();
|
||||
|
||||
auto r = benchmark_copy(devices[gi].name.c_str(), "Host", "get", size, params,
|
||||
[&]{ ggml_backend_synchronize(be); },
|
||||
[&]{ },
|
||||
[&]{ ggml_backend_tensor_get(src, host_data.data(), 0, size); },
|
||||
[&]{ ggml_backend_synchronize(be); });
|
||||
[&]{ });
|
||||
|
||||
params.csv ? print_csv_row(r) : print_table_row(r);
|
||||
}
|
||||
@@ -434,50 +498,68 @@ int main(int argc, char ** argv) {
|
||||
ggml_tensor * dst = find_tensor(di);
|
||||
if (!src || !dst) { continue; }
|
||||
|
||||
ggml_backend_t be_src = devices[si].backend.get();
|
||||
ggml_backend_t be_dst = devices[di].backend.get();
|
||||
|
||||
auto r = benchmark_copy(
|
||||
devices[si].name.c_str(), devices[di].name.c_str(), "copy_sync", size, params,
|
||||
[&]{
|
||||
ggml_backend_synchronize(be_src);
|
||||
ggml_backend_synchronize(be_dst);
|
||||
},
|
||||
[&]{ },
|
||||
[&]{ ggml_backend_tensor_copy(src, dst); },
|
||||
[&]{
|
||||
ggml_backend_synchronize(be_src);
|
||||
ggml_backend_synchronize(be_dst);
|
||||
});
|
||||
[&]{ });
|
||||
|
||||
params.csv ? print_csv_row(r) : print_table_row(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Device-to-Device (async)
|
||||
// Device-to-Device (async, pipelined)
|
||||
if (params.do_d2d && params.do_async) {
|
||||
int n = params.async_copies;
|
||||
|
||||
struct dev_async_tensors {
|
||||
size_t dev_idx;
|
||||
std::vector<tensor_on_device> tds;
|
||||
};
|
||||
std::vector<dev_async_tensors> async_tensors;
|
||||
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (!devices[i].buffer) { continue; }
|
||||
if (devices[i].mem_total > 0 && (size_t)n * size + 4096 > devices[i].mem_free) { continue; }
|
||||
char prefix[64];
|
||||
snprintf(prefix, sizeof(prefix), "%s_async_%s", devices[i].name.c_str(), format_size(size).c_str());
|
||||
auto tds = create_tensors_on_buffer(devices[i].buffer.get(), size, n, prefix);
|
||||
if ((int)tds.size() < n) { continue; }
|
||||
async_tensors.push_back({ i, std::move(tds) });
|
||||
}
|
||||
|
||||
auto find_async = [&](size_t dev_idx) -> std::vector<tensor_on_device> * {
|
||||
for (auto & at : async_tensors) {
|
||||
if (at.dev_idx == dev_idx) { return &at.tds; }
|
||||
}
|
||||
return nullptr;
|
||||
};
|
||||
|
||||
char method[32];
|
||||
snprintf(method, sizeof(method), "async(%d)", n);
|
||||
|
||||
for (size_t si = 0; si < devices.size(); si++) {
|
||||
for (size_t di = 0; di < devices.size(); di++) {
|
||||
if (si == di) { continue; }
|
||||
|
||||
ggml_tensor * src = find_tensor(si);
|
||||
ggml_tensor * dst = find_tensor(di);
|
||||
if (!src || !dst) { continue; }
|
||||
auto * srcs = find_async(si);
|
||||
auto * dsts = find_async(di);
|
||||
if (!srcs || !dsts) { continue; }
|
||||
|
||||
ggml_backend_t be_src = devices[si].backend.get();
|
||||
ggml_backend_t be_dst = devices[di].backend.get();
|
||||
|
||||
auto r = benchmark_copy(
|
||||
devices[si].name.c_str(), devices[di].name.c_str(), "copy_async", size, params,
|
||||
devices[si].name.c_str(), devices[di].name.c_str(), method,
|
||||
(size_t)n * size, params,
|
||||
[&]{ },
|
||||
[&]{
|
||||
ggml_backend_synchronize(be_src);
|
||||
ggml_backend_synchronize(be_dst);
|
||||
for (int k = 0; k < n; k++) {
|
||||
ggml_backend_tensor_copy_async(be_src, be_dst, (*srcs)[k].tensor, (*dsts)[k].tensor);
|
||||
}
|
||||
},
|
||||
[&]{ ggml_backend_tensor_copy_async(be_src, be_dst, src, dst); },
|
||||
[&]{
|
||||
ggml_backend_synchronize(be_src);
|
||||
ggml_backend_synchronize(be_dst);
|
||||
});
|
||||
[&]{ ggml_backend_synchronize(be_dst); });
|
||||
|
||||
params.csv ? print_csv_row(r) : print_table_row(r);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user