mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-28 15:20:20 +00:00
354ebac8cb
* server: real-time reasoning interruption via control endpoint
  Builds on the manual reasoning budget trigger from #23949. Adds a CONTROL task that mirrors the CANCEL path on the live slot and calls common_sampler_reasoning_budget_force to end thinking mid-generation. POST /v1/chat/completions/control with { id_slot, action }, opt-in reasoning_control arms the budget sampler on demand. Router and single model. Minimal WebUI button as a skeleton for further UI work.
* ui: track reasoning phase via explicit streaming state
  Add isReasoning to the chat store, mirroring the isLoading pattern: per conversation map, private setter, public accessor and reactive export. Set from the stream callbacks, true on reasoning chunks, false on the first content chunk, reset on stream end and resynced on conversation switch. The skip button now keys off isReasoning so it shows only during the thinking phase, not the whole generation.
* ui: extract control endpoint and action into constants
  Move the chat completion routes, the slots route and the reasoning control action out of chat.service into api-endpoints and a dedicated control-actions module. No behavior change, drops the magic strings so the control protocol has a single source of truth.
* server: target reasoning control by completion id
  Address @ngxson review on the control endpoint. Switch from id_slot to the chat completion id to avoid a TOCTOU: the slot can be reassigned between the lookup and the control request, so matching the live completion (oaicompat_cmpl_id) is safe and a finished one simply matches nothing. Rename the action to reasoning_end, guard it on the reasoning_control flag of the target slot, and reduce the response to {success} with an optional message.
* ui: target reasoning control by completion id
  Keep the streamed completion id on the message and post it back to the control endpoint instead of probing /slots. Drops the slot discovery and the TOCTOU that came with it. Action renamed to reasoning_end, response read as {success}.
* server: address review from @ngxson
  Move the control fields into task_params and drop the redundant comments on the control path.
* server: document the reasoning control endpoint
* Update tools/ui/src/lib/types/database.d.ts
  Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
* ui: rename cmplId to completionId
  Per @allozaur review, clearer name for the streamed completion id.
* ui: wire completion id capture through the agentic flow
  The webui streams through the agentic flow, which relayed onModel but not onCompletionId, so the completion id never reached the message and the control request was never sent. Relay it through the flow and its callbacks type, declare id on the chunk type, and log an explicit error when the button fires without a usable id.
* ui: target reasoning control model from the message
  The model is a property of the completion, so read it from the streaming message like the id, not from the model dropdown which is unrelated UI state. Makes the request self-consistent by construction instead of just unlikely to drift.
---------
Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
153 lines
5.3 KiB
C++
153 lines
5.3 KiB
C++
#pragma once
|
|
|
|
#include "server-http.h"
|
|
#include "server-task.h"
|
|
#include "server-queue.h"
|
|
|
|
#include <nlohmann/json_fwd.hpp>
|
|
|
|
#include <cstddef>
|
|
#include <memory>
|
|
#include <set>
|
|
|
|
// opaque in this header: server_context owns it through std::unique_ptr and
// server_routes refers to it by const reference
struct server_context_impl; // private implementation
|
|
|
|
struct server_context_meta {
|
|
std::string build_info;
|
|
std::string model_name;
|
|
std::set<std::string> model_aliases;
|
|
std::set<std::string> model_tags;
|
|
std::string model_path;
|
|
bool has_mtmd;
|
|
bool has_inp_image;
|
|
bool has_inp_audio;
|
|
json json_ui_settings; // Primary: new name
|
|
json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat)
|
|
int slot_n_ctx;
|
|
enum llama_pooling_type pooling_type;
|
|
|
|
// chat params
|
|
server_chat_params & chat_params;
|
|
std::map<std::string, bool> chat_template_caps;
|
|
|
|
// tokens
|
|
std::string bos_token_str;
|
|
std::string eos_token_str;
|
|
llama_token fim_pre_token;
|
|
llama_token fim_sub_token;
|
|
llama_token fim_mid_token;
|
|
llama_token fim_pad_token;
|
|
llama_token fim_rep_token;
|
|
llama_token fim_sep_token;
|
|
|
|
// sampling
|
|
std::vector<llama_logit_bias> logit_bias_eog;
|
|
|
|
// model meta
|
|
enum llama_vocab_type model_vocab_type;
|
|
int32_t model_vocab_n_tokens;
|
|
int32_t model_n_ctx_train;
|
|
int32_t model_n_embd_inp;
|
|
uint64_t model_n_params;
|
|
uint64_t model_size;
|
|
};
|
|
|
|
struct server_context {
|
|
std::unique_ptr<server_context_impl> impl;
|
|
|
|
server_context();
|
|
~server_context();
|
|
|
|
// load the model and initialize llama_context
|
|
// returns true on success
|
|
bool load_model(common_params & params);
|
|
|
|
// this function will block main thread until termination
|
|
void start_loop();
|
|
|
|
// terminate main loop (will unblock start_loop)
|
|
void terminate();
|
|
|
|
// get the underlaying llama_context, can return nullptr if sleeping
|
|
// not thread-safe, should only be used from the main thread
|
|
llama_context * get_llama_context() const;
|
|
|
|
// get a new response reader, used by CLI application
|
|
server_response_reader get_response_reader();
|
|
|
|
// get server metadata (read-only), can only be called after load_model()
|
|
// not thread-safe, should only be used from the main thread
|
|
server_context_meta get_meta() const;
|
|
|
|
// register a callback to be called when sleeping state changes
|
|
// must be set before load_model() is called
|
|
void on_sleeping_changed(std::function<void(bool)> callback);
|
|
};
|
|
|
|
|
|
// forward declarations
|
|
// only named through std::unique_ptr<server_res_generator> in the server_routes declarations below
struct server_res_generator;
|
|
|
|
struct server_routes {
|
|
server_routes(const common_params & params, server_context & ctx_server);
|
|
|
|
void init_routes();
|
|
|
|
// note: this is not thread-safe and can only when ctx_http.is_ready is false
|
|
void update_meta(const server_context & ctx_server) {
|
|
this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
|
|
}
|
|
|
|
// handlers using lambda function, so that they can capture `this` without `std::bind`
|
|
// they won't be called until ctx_http.is_ready is set to true
|
|
server_http_context::handler_t get_health;
|
|
server_http_context::handler_t get_metrics;
|
|
server_http_context::handler_t get_slots;
|
|
server_http_context::handler_t post_slots;
|
|
server_http_context::handler_t get_props;
|
|
server_http_context::handler_t post_props;
|
|
server_http_context::handler_t post_infill;
|
|
server_http_context::handler_t post_completions;
|
|
server_http_context::handler_t post_completions_oai;
|
|
server_http_context::handler_t post_chat_completions;
|
|
server_http_context::handler_t post_control;
|
|
server_http_context::handler_t post_responses_oai;
|
|
server_http_context::handler_t post_transcriptions_oai;
|
|
server_http_context::handler_t post_anthropic_messages;
|
|
server_http_context::handler_t post_anthropic_count_tokens;
|
|
server_http_context::handler_t post_apply_template;
|
|
server_http_context::handler_t get_models;
|
|
server_http_context::handler_t post_tokenize;
|
|
server_http_context::handler_t post_detokenize;
|
|
server_http_context::handler_t post_embeddings;
|
|
server_http_context::handler_t post_embeddings_oai;
|
|
server_http_context::handler_t post_rerank;
|
|
server_http_context::handler_t get_lora_adapters;
|
|
server_http_context::handler_t post_lora_adapters;
|
|
|
|
// to be used in router mode
|
|
json get_model_info() const;
|
|
|
|
private:
|
|
std::unique_ptr<server_res_generator> handle_completions_impl(
|
|
const server_http_req & req,
|
|
server_task_type type,
|
|
const json & data,
|
|
const std::vector<raw_buffer> & files,
|
|
task_response_type res_type);
|
|
std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
|
|
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
|
|
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
|
|
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
|
|
|
|
// using unique_ptr to allow late initialization of const
|
|
std::unique_ptr<const server_context_meta> meta;
|
|
|
|
const common_params & params;
|
|
const server_context_impl & ctx_server;
|
|
|
|
server_queue & queue_tasks;
|
|
server_response & queue_results;
|
|
std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
|
|
};
|