diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index e0d731bc04..e54f95e3b8 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -261,7 +261,7 @@ server_models::server_models( ggml_backend_dev_memory(dev, &free, &total); if (total > 0) { const uint64_t available = (free > memory_margin) ? free - memory_margin : 0; - memory_per_device[dev] = available; + available_memory_per_device[dev] = available; SRV_DBG("device %s: available memory after margin=%lu MB\n", ggml_backend_dev_name(dev), (unsigned long)(available / (1024 * 1024))); @@ -807,7 +807,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me model_memory_map total_memory_per_device; for (const auto & m : mapping) { if (m.second.meta.is_running()) { - for (const auto& [key, value] : m.second.meta.memory_per_device) { + for (const auto& [key, value] : m.second.meta.memory_usage_per_device) { total_memory_per_device[key] += value; } } @@ -820,7 +820,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me uint64_t memory_exceeded = 0; - for (const auto& [key, limit] : memory_per_device) { + for (const auto& [key, limit] : available_memory_per_device) { const uint64_t total_memory = get(total_memory_per_device, key); const uint64_t new_memory = get(new_model_memory_per_device, key); SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n", @@ -838,7 +838,7 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me } void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) { - const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty(); + const bool check_memory = base_params.models_memory_margin > 0 && !available_memory_per_device.empty(); if (base_params.models_max <= 0 && !check_memory) { return; // no limit @@ -948,11 +948,11 @@ void server_models::load(const std::string & name) { if (base_params.models_memory_margin > 0) { std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - if (meta.memory_per_device.empty()) { - meta.memory_per_device = get_model_memory_per_device(meta.preset); + if (meta.memory_usage_per_device.empty()) { + meta.memory_usage_per_device = get_model_memory_per_device(meta.preset); } - new_model_memory_per_device = meta.memory_per_device; + new_model_memory_per_device = meta.memory_usage_per_device; } unload_lru(new_model_memory_per_device); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 209bd46b5c..73148eb81c 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -72,7 +72,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - model_memory_map memory_per_device; // projected bytes per device + model_memory_map memory_usage_per_device; // bytes used per device std::vector args; // args passed to the model instance, will be populated by render_args() json loaded_info; // info to be reflected via /v1/models endpoint ; if in DOWNLOADING state, it should contain download progress info int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) @@ -132,7 +132,7 @@ private: common_preset base_preset; // base preset from llama-server CLI args // available memory per device - std::map memory_per_device; + std::map available_memory_per_device; void update_meta(const std::string & name, const server_model_meta & meta);