Files
llama.cpp/tools/server/server-task.cpp
T
Pascal 354ebac8cb server: real-time reasoning interruption via control endpoint (#23971)
* server: real-time reasoning interruption via control endpoint

Builds on the manual reasoning budget trigger from #23949. Adds a
CONTROL task that mirrors the CANCEL path on the live slot and calls
common_sampler_reasoning_budget_force to end thinking mid-generation.
POST /v1/chat/completions/control with { id_slot, action }, opt-in
reasoning_control arms the budget sampler on demand. Router and single
model. Minimal WebUI button as a skeleton for further UI work.

* ui: track reasoning phase via explicit streaming state

Add isReasoning to the chat store, mirroring the isLoading pattern:
per conversation map, private setter, public accessor and reactive
export. Set from the stream callbacks, true on reasoning chunks, false
on the first content chunk, reset on stream end and resynced on
conversation switch. The skip button now keys off isReasoning so it
shows only during the thinking phase, not the whole generation.

* ui: extract control endpoint and action into constants

Move the chat completion routes, the slots route and the reasoning
control action out of chat.service into api-endpoints and a dedicated
control-actions module. No behavior change, drops the magic strings so
the control protocol has a single source of truth.

* server: target reasoning control by completion id

Address @ngxson review on the control endpoint.

Switch from id_slot to the chat completion id to avoid a TOCTOU: the
slot can be reassigned between the lookup and the control request, so
matching the live completion (oaicompat_cmpl_id) is safe and a finished
one simply matches nothing. Rename the action to reasoning_end, guard
it on the reasoning_control flag of the target slot, and reduce the
response to {success} with an optional message.

* ui: target reasoning control by completion id

Keep the streamed completion id on the message and post it back to the
control endpoint instead of probing /slots. Drops the slot discovery
and the TOCTOU that came with it. Action renamed to reasoning_end,
response read as {success}.

* server: address review from @ngxson

Move the control fields into task_params and drop the redundant
comments on the control path.

* server: document the reasoning control endpoint

* Update tools/ui/src/lib/types/database.d.ts

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>

* ui: rename cmplId to completionId

Per @allozaur review, clearer name for the streamed completion id.

* ui: wire completion id capture through the agentic flow

The webui streams through the agentic flow, which relayed onModel but
not onCompletionId, so the completion id never reached the message and
the control request was never sent. Relay it through the flow and its
callbacks type, declare id on the chunk type, and log an explicit error
when the button fires without a usable id.

* ui: target reasoning control model from the message

The model is a property of the completion, so read it from the streaming
message like the id, not from the model dropdown which is unrelated UI
state. Makes the request self-consistent by construction instead of just
unlikely to drift.

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2026-06-02 07:26:20 +02:00

2176 lines
82 KiB
C++

#include "server-task.h"
#include "build-info.h"
#include "server-chat.h"
#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "sampling.h"
#include "speculative.h"
#include "server-common.h"
using json = nlohmann::ordered_json;
//
// task_params
//
// Converts a list of logit biases into a JSON array; each element is an
// object carrying the "bias" value and the "token" id it applies to.
json task_params::format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const {
    json out = json::array();
    for (const auto & entry : logit_bias) {
        json item = json{
            {"bias",  entry.bias},
            {"token", entry.token},
        };
        out.push_back(std::move(item));
    }
    return out;
}
// Serializes the task parameters to JSON.
//
// only_metrics == true  -> reduced view (no stop words, logit biases, grammar
//                          data, preserved tokens or DRY sequence breakers)
// only_metrics == false -> full view
//
// The key order is significant: `json` is an ordered_json, so the output
// preserves the insertion order used below.
json task_params::to_json(bool only_metrics) const {
    // sampler chain, rendered as human readable names
    std::vector<std::string> sampler_names;
    sampler_names.reserve(sampling.samplers.size());
    for (const auto & type : sampling.samplers) {
        sampler_names.emplace_back(common_sampler_type_to_str(type));
    }

    // LoRA adapters as [{id, scale}, ...]
    json lora_json = json::array();
    for (const auto & adapter : this->lora) {
        lora_json.push_back({{"id", adapter.first}, {"scale", adapter.second}});
    }

    if (!only_metrics) {
        // grammar triggers are only part of the full view
        json trigger_list = json::array();
        for (const auto & trigger : sampling.grammar_triggers) {
            server_grammar_trigger wrapped(trigger);
            trigger_list.push_back(wrapped.to_json());
        }

        return json {
            {"seed",                  sampling.seed},
            {"temperature",           sampling.temp},
            {"dynatemp_range",        sampling.dynatemp_range},
            {"dynatemp_exponent",     sampling.dynatemp_exponent},
            {"top_k",                 sampling.top_k},
            {"top_p",                 sampling.top_p},
            {"min_p",                 sampling.min_p},
            {"top_n_sigma",           sampling.top_n_sigma},
            {"xtc_probability",       sampling.xtc_probability},
            {"xtc_threshold",         sampling.xtc_threshold},
            {"typical_p",             sampling.typ_p},
            {"repeat_last_n",         sampling.penalty_last_n},
            {"repeat_penalty",        sampling.penalty_repeat},
            {"presence_penalty",      sampling.penalty_present},
            {"frequency_penalty",     sampling.penalty_freq},
            {"dry_multiplier",        sampling.dry_multiplier},
            {"dry_base",              sampling.dry_base},
            {"dry_allowed_length",    sampling.dry_allowed_length},
            {"dry_penalty_last_n",    sampling.dry_penalty_last_n},
            {"dry_sequence_breakers", sampling.dry_sequence_breakers},
            {"mirostat",              sampling.mirostat},
            {"mirostat_tau",          sampling.mirostat_tau},
            {"mirostat_eta",          sampling.mirostat_eta},
            {"stop",                  antiprompt},
            {"max_tokens",            n_predict},
            {"n_predict",             n_predict}, // TODO: deduplicate?
            {"n_keep",                n_keep},
            {"n_discard",             n_discard},
            {"ignore_eos",            sampling.ignore_eos},
            {"stream",                stream},
            {"logit_bias",            format_logit_bias(sampling.logit_bias)},
            {"n_probs",               sampling.n_probs},
            {"min_keep",              sampling.min_keep},
            {"grammar",               common_grammar_value(sampling.grammar)},
            {"grammar_lazy",          sampling.grammar_lazy},
            {"grammar_triggers",      trigger_list},
            {"preserved_tokens",      sampling.preserved_tokens},
            {"chat_format",           common_chat_format_name(chat_parser_params.format)},
            {"reasoning_format",      common_reasoning_format_name(chat_parser_params.reasoning_format)},
            {"reasoning_in_content",  chat_parser_params.reasoning_in_content},
            {"generation_prompt",     chat_parser_params.generation_prompt},
            {"samplers",              sampler_names},
            {"speculative.types",     common_speculative_type_name_str(speculative.types)},
            {"timings_per_token",     timings_per_token},
            {"post_sampling_probs",   post_sampling_probs},
            {"backend_sampling",      sampling.backend_sampling},
            {"lora",                  lora_json},
        };
    }

    // reduced, metrics-only view
    return json {
        {"seed",                 sampling.seed},
        {"temperature",          sampling.temp},
        {"dynatemp_range",       sampling.dynatemp_range},
        {"dynatemp_exponent",    sampling.dynatemp_exponent},
        {"top_k",                sampling.top_k},
        {"top_p",                sampling.top_p},
        {"min_p",                sampling.min_p},
        {"top_n_sigma",          sampling.top_n_sigma},
        {"xtc_probability",      sampling.xtc_probability},
        {"xtc_threshold",        sampling.xtc_threshold},
        {"typical_p",            sampling.typ_p},
        {"repeat_last_n",        sampling.penalty_last_n},
        {"repeat_penalty",       sampling.penalty_repeat},
        {"presence_penalty",     sampling.penalty_present},
        {"frequency_penalty",    sampling.penalty_freq},
        {"dry_multiplier",       sampling.dry_multiplier},
        {"dry_base",             sampling.dry_base},
        {"dry_allowed_length",   sampling.dry_allowed_length},
        {"dry_penalty_last_n",   sampling.dry_penalty_last_n},
        {"mirostat",             sampling.mirostat},
        {"mirostat_tau",         sampling.mirostat_tau},
        {"mirostat_eta",         sampling.mirostat_eta},
        {"max_tokens",           n_predict},
        {"n_predict",            n_predict}, // TODO: deduplicate?
        {"n_keep",               n_keep},
        {"n_discard",            n_discard},
        {"ignore_eos",           sampling.ignore_eos},
        {"stream",               stream},
        {"n_probs",              sampling.n_probs},
        {"min_keep",             sampling.min_keep},
        {"chat_format",          common_chat_format_name(chat_parser_params.format)},
        {"reasoning_format",     common_reasoning_format_name(chat_parser_params.reasoning_format)},
        {"reasoning_in_content", chat_parser_params.reasoning_in_content},
        {"generation_prompt",    chat_parser_params.generation_prompt},
        {"samplers",             sampler_names},
        {"speculative.types",    common_speculative_type_name_str(speculative.types)},
        {"timings_per_token",    timings_per_token},
        {"post_sampling_probs",  post_sampling_probs},
        {"backend_sampling",     sampling.backend_sampling},
        {"lora",                 lora_json},
    };
}
//
// task_result_state
//
// Creates the streaming result state for one request: keeps a copy of the chat
// parser parameters and generates the "resp_", "rs_" and "msg_" prefixed ids
// from random strings.
task_result_state::task_result_state(const common_chat_parser_params & chat_parser_params)
    : chat_parser_params(chat_parser_params)
    , oai_resp_id("resp_" + random_string())
    , oai_resp_reasoning_id("rs_" + random_string())
    , oai_resp_message_id("msg_" + random_string()) {
    const bool seed_from_prefill = chat_parser_params.is_continuation && !chat_parser_params.echo;
    if (!seed_from_prefill) {
        return;
    }

    // Pre-parse an empty partial message so that the first computed diff does
    // not contain the assistant prefill.
    chat_msg = common_chat_parse("", true, chat_parser_params);
}
// Appends newly generated text to the accumulated output, re-parses the whole
// accumulated text as a chat message and reports what changed.
//
// text_added        - text produced since the previous call
// is_partial        - forwarded to common_chat_parse(); false on the final call
// diffs             - output: deltas between the previous and the new message
//                     (assigned when filter_tool_calls is false, appended to otherwise)
// filter_tool_calls - when true, tool call deltas are reshaped so that the id/name
//                     "header" of each tool call is emitted once, and later deltas
//                     for that call only carry arguments
//
// Returns the current chat message. If the parse result is empty, the stored
// message is left untouched and `diffs` is not modified.
common_chat_msg task_result_state::update_chat_msg(
const std::string & text_added,
bool is_partial,
std::vector<common_chat_msg_diff> & diffs,
bool filter_tool_calls) {
generated_text += text_added;
// snapshot of the previous message, used as the left-hand side of the diff
auto msg_prv_copy = chat_msg;
//SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
generated_text,
is_partial,
chat_parser_params);
if (!new_msg.empty()) {
new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
chat_msg = new_msg;
auto all_diffs = common_chat_msg_diff::compute_diffs(msg_prv_copy, chat_msg);
if (!filter_tool_calls) {
// no filtering requested: hand the raw diffs to the caller
diffs = std::move(all_diffs);
} else {
for (auto & d : all_diffs) {
// If this is a new type of delta, flush all currently pending tool call names
for (size_t i = 0; i < chat_msg.tool_calls.size(); ++i) {
// skip tool calls whose header was already sent, or whose name is not known yet
if (sent_tool_call_names.count(i) || chat_msg.tool_calls[i].name.empty()) {
continue;
}
// the current diff targets something else, or already carries arguments:
// emit a standalone header (id + name) for tool call i first
if (d.tool_call_index != i || !d.tool_call_delta.arguments.empty()) {
common_chat_msg_diff header;
header.tool_call_index = i;
header.tool_call_delta.id = chat_msg.tool_calls[i].id;
header.tool_call_delta.name = chat_msg.tool_calls[i].name;
diffs.push_back(std::move(header));
sent_tool_call_names.insert(i);
}
}
// presumably npos marks a diff that is not tied to any tool call; it is forwarded unchanged
if (d.tool_call_index == std::string::npos) {
diffs.push_back(std::move(d));
} else {
size_t i = d.tool_call_index;
if (sent_tool_call_names.count(i)) {
// header already sent: forward argument deltas only, without repeating name/id
if (!d.tool_call_delta.arguments.empty()) {
d.tool_call_delta.name = "";
d.tool_call_delta.id = "";
diffs.push_back(std::move(d));
}
} else {
// Not sent yet.
// send it with the full name/id once there are arguments, or when this is the final parse
if (!d.tool_call_delta.arguments.empty() || !is_partial) {
d.tool_call_delta.name = chat_msg.tool_calls[i].name;
d.tool_call_delta.id = chat_msg.tool_calls[i].id;
diffs.push_back(std::move(d));
sent_tool_call_names.insert(i);
} else {
// Suppress
}
}
}
}
// Final check at EOF
// any named tool call whose header was never emitted gets one now
if (!is_partial) {
for (size_t i = 0; i < chat_msg.tool_calls.size(); ++i) {
if (!sent_tool_call_names.count(i) && !chat_msg.tool_calls[i].name.empty()) {
common_chat_msg_diff header;
header.tool_call_index = i;
header.tool_call_delta.id = chat_msg.tool_calls[i].id;
header.tool_call_delta.name = chat_msg.tool_calls[i].name;
diffs.push_back(std::move(header));
sent_tool_call_names.insert(i);
}
}
}
}
}
return chat_msg;
}
//
// server_task
//
// Builds the per-request task parameters from a completion request body.
//
// vocab          - vocabulary used to tokenize preserved tokens, grammar trigger
//                  words, reasoning budget tags and string logit-bias keys
// params_base    - server-wide parameters providing the fallback values
// n_ctx_slot     - substituted for repeat_last_n / dry_penalty_last_n when they are -1
// logit_bias_eog - biases appended to the request's logit biases when ignore_eos is set
// data           - JSON request body
//
// Throws std::runtime_error when: "lora" is not an array, repeat_last_n or
// dry_penalty_last_n is below -1, "dry_sequence_breakers" is present but empty,
// the JSON schema cannot be converted to a grammar, a single-token grammar
// trigger word is not listed as a preserved token, a grammar trigger has an
// unknown type, a lazy grammar has no triggers, or n_cmpl exceeds n_parallel.
// Typed JSON accessors used below (get<T>(), at()) may raise their own exceptions.
task_params server_task::params_from_json_cmpl(
const llama_vocab * vocab,
const common_params & params_base,
const int n_ctx_slot,
const std::vector<llama_logit_bias> & logit_bias_eog,
const json & data) {
task_params params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
// Only the fields assigned below come from params_base; every other member of
// `defaults` keeps the value of a default-constructed task_params.
task_params defaults;
defaults.sampling = params_base.sampling;
defaults.speculative = params_base.speculative;
defaults.n_keep = params_base.n_keep;
defaults.n_predict = params_base.n_predict;
defaults.n_cache_reuse = params_base.n_cache_reuse;
defaults.cache_prompt = params_base.cache_prompt;
defaults.antiprompt = params_base.antiprompt;
// enabling this will output extra debug information in the HTTP responses from the server
params.verbose = params_base.verbosity > 9;
params.timings_per_token = json_value(data, "timings_per_token", false);
params.stream = json_value(data, "stream", false);
auto stream_opt = json_value(data, "stream_options", json::object());
params.include_usage = json_value(stream_opt, "include_usage", false);
params.cache_prompt = json_value(data, "cache_prompt", defaults.cache_prompt);
params.return_tokens = json_value(data, "return_tokens", false);
params.return_progress = json_value(data, "return_progress", false);
// token limit lookup order: "n_predict", then "max_completion_tokens", then "max_tokens", then the server default
auto max_tokens = json_value(data, "max_tokens", defaults.n_predict);
params.n_predict = json_value(data, "n_predict", json_value(data, "max_completion_tokens", max_tokens));
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
// negative n_discard values are clamped to 0
params.n_discard = std::max(0, params.n_discard);
// number of completions: "n_cmpl", falling back to "n", falling back to 1
params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1));
params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
// sampling parameters: request value if present, otherwise the server-wide default
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target);
params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay);
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
// speculative decoding settings are taken verbatim from the server configuration
params.speculative = defaults.speculative;
// TODO: to keep things simple, we disable speculative parameter adjustments for now
#if 0
// TODO: for now, be able to adjust only the draft-model based speculative parameters
params.speculative.draft.n_min = json_value(data, "speculative.n_min", defaults.speculative.draft.n_min);
params.speculative.draft.n_max = json_value(data, "speculative.n_max", defaults.speculative.draft.n_max);
params.speculative.draft.p_min = json_value(data, "speculative.p_min", defaults.speculative.draft.p_min);
params.speculative.draft.n_min = std::min(params.speculative.draft.n_max, params.speculative.draft.n_min);
params.speculative.draft.n_min = std::max(params.speculative.draft.n_min, 0);
params.speculative.draft.n_max = std::max(params.speculative.draft.n_max, 0);
// for debugging and research purposes
params.speculative.type = common_speculative_type_from_name(json_value(data, "speculative.type", common_speculative_type_to_str(defaults.speculative.type)));
params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n);
params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m);
params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits);
params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024);
params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024);
params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024);
#endif
// Use OpenAI API logprobs only if n_probs wasn't provided
// ("wasn't provided" is detected by n_probs still being equal to the default)
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
}
// LoRA adapters: must be an array when present, otherwise the list is empty
if (data.contains("lora")) {
if (data.at("lora").is_array()) {
params.lora = parse_lora_request(data.at("lora"));
} else {
throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
}
} else {
params.lora = {};
}
// TODO: add more sanity checks for the input parameters
if (params.sampling.penalty_last_n < -1) {
throw std::runtime_error("Error: repeat_last_n must be >= -1");
}
if (params.sampling.dry_penalty_last_n < -1) {
throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
}
// -1 is replaced by the slot context size
if (params.sampling.penalty_last_n == -1) {
// note: should be the slot's context and not the full context, but it's ok
params.sampling.penalty_last_n = n_ctx_slot;
}
if (params.sampling.dry_penalty_last_n == -1) {
params.sampling.dry_penalty_last_n = n_ctx_slot;
}
// a dry_base below 1.0 is silently replaced by the server default
if (params.sampling.dry_base < 1.0f) {
params.sampling.dry_base = defaults.sampling.dry_base;
}
// sequence breakers for DRY
{
// Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format
// Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
if (data.contains("dry_sequence_breakers")) {
params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
if (params.sampling.dry_sequence_breakers.empty()) {
throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings");
}
}
}
// process "json_schema" and "grammar"
// "json_schema" is only used when no explicit "grammar" is given.
// NOTE(review): in the json_schema branch "grammar_lazy" is never read from the
// request, so grammar_lazy keeps its initial value there - confirm this is intended.
if (data.contains("json_schema") && !data.contains("grammar")) {
try {
auto schema = json_value(data, "json_schema", json::object());
SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
std::string grammar_str = json_schema_to_grammar(schema);
SRV_DBG("Converted grammar: %s\n", grammar_str.c_str());
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, std::move(grammar_str)};
} catch (const std::exception & e) {
// re-thrown with a prefix naming the offending request field
throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
}
} else {
// start from the server default grammar; a non-empty "grammar" in the request replaces it
params.sampling.grammar = defaults.sampling.grammar;
std::string grammar_str = json_value(data, "grammar", std::string());
if (!grammar_str.empty()) {
// grammar_type key is set by the server when converting chat template grammars
std::string grammar_type = json_value(data, "grammar_type", std::string());
if (grammar_type == "tool_calls") {
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::move(grammar_str)};
} else {
// explicit grammar from the user (API field "grammar")
params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, std::move(grammar_str)};
}
SRV_DBG("Grammar (%s): %s\n", grammar_type.c_str(), common_grammar_value(params.sampling.grammar).c_str());
}
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
}
// chat parser configuration
{
auto it = data.find("chat_format");
if (it != data.end()) {
// the request carries the format as its integer enum value
params.chat_parser_params.format = static_cast<common_chat_format>(it->get<int>());
SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format));
} else {
params.chat_parser_params.format = defaults.chat_parser_params.format;
}
common_reasoning_format reasoning_format = params_base.reasoning_format;
if (data.contains("reasoning_format")) {
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
}
params.chat_parser_params.reasoning_format = reasoning_format;
// reasoning is kept inside the content only for streamed requests using the legacy DeepSeek format
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
// the sampler receives the same generation prompt as the parser
params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str());
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
}
if (data.contains("continue_final_message")) {
auto continuation = common_chat_continuation_parse(data.at("continue_final_message"));
params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE;
}
params.chat_parser_params.echo = json_value(data, "echo", false);
}
// preserved tokens and grammar triggers
// (preserved tokens must be collected first: trigger words are validated against them)
{
const auto preserved_tokens = data.find("preserved_tokens");
if (preserved_tokens != data.end()) {
for (const auto & t : *preserved_tokens) {
auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
SRV_DBG("Preserved token: %d\n", ids[0]);
params.sampling.preserved_tokens.insert(ids[0]);
} else {
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
}
}
}
const auto grammar_triggers = data.find("grammar_triggers");
if (grammar_triggers != data.end()) {
for (const auto & t : *grammar_triggers) {
server_grammar_trigger ct(t);
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
const auto & word = ct.value.value;
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
// a word that maps to exactly one token becomes a token trigger,
// and that token must have been declared as preserved
auto token = ids[0];
if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
}
SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
common_grammar_trigger trigger;
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
trigger.value = word;
trigger.token = token;
params.sampling.grammar_triggers.push_back(std::move(trigger));
} else {
// otherwise the word is kept as a word trigger
SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
}
} else {
// pattern triggers are passed through unchanged; any other type is rejected
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) {
SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str());
} else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) {
SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str());
} else {
throw std::runtime_error("Unknown grammar trigger type");
}
params.sampling.grammar_triggers.emplace_back(std::move(ct.value));
}
}
}
if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
throw std::runtime_error("Error: no triggers set for lazy grammar!");
}
}
// Parse reasoning budget sampler parameters
{
const int32_t budget = json_value(data, "reasoning_budget_tokens", (int32_t) -1);
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
params.sampling.reasoning_budget_tokens = budget;
params.sampling.reasoning_control = json_value(data, "reasoning_control", false);
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
}
if (!end_tag.empty()) {
params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true);
// the forced sequence is the optional message followed by the end tag
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
budget, params.sampling.generation_prompt.c_str(),
params.sampling.reasoning_budget_start.size(),
params.sampling.reasoning_budget_end.size(),
params.sampling.reasoning_budget_forced.size());
}
}
// logit biases: accepted either as an array of [token, bias] pairs or as an object {token: bias}
{
params.sampling.logit_bias.clear();
const auto & logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array()) {
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & el : *logit_bias) {
// TODO: we may want to throw errors here, in case "el" is incorrect
if (el.is_array() && el.size() == 2) {
// bias value: a number, or `false` meaning -INFINITY; anything else skips the entry
float bias;
if (el[1].is_number()) {
bias = el[1].get<float>();
} else if (el[1].is_boolean() && !el[1].get<bool>()) {
bias = -INFINITY;
} else {
continue;
}
if (el[0].is_number_integer()) {
// integer token id: applied only when inside the vocabulary range
llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab) {
params.sampling.logit_bias.push_back({tok, bias});
}
} else if (el[0].is_string()) {
// string: the bias is applied to every token the string tokenizes to
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks) {
params.sampling.logit_bias.push_back({tok, bias});
}
}
}
}
} else if (logit_bias != data.end() && logit_bias->is_object()) {
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & el : logit_bias->items()) {
float bias;
const auto & key = el.key();
const auto & value = el.value();
if (value.is_number()) {
bias = value.get<float>();
} else if (value.is_boolean() && !value.get<bool>()) {
bias = -INFINITY;
} else {
continue;
}
// a key that parses completely as a base-10 integer is a token id, otherwise it is text to tokenize
// NOTE(review): an empty key leaves *end == 0 with tok == 0, so "" would be treated as
// token id 0; the long returned by strtol is also narrowed to llama_token - confirm
// whether either case needs guarding.
char *end;
llama_token tok = strtol(key.c_str(), &end, 10);
if (*end == 0) {
if (tok >= 0 && tok < n_vocab) {
params.sampling.logit_bias.push_back({tok, bias});
}
} else {
auto toks = common_tokenize(vocab, key, false);
for (auto tok : toks) {
params.sampling.logit_bias.push_back({tok, bias});
}
}
}
}
// ignore_eos is implemented by appending the caller-supplied end-of-generation biases
params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
if (params.sampling.ignore_eos) {
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
logit_bias_eog.begin(), logit_bias_eog.end());
}
}
// stop words
{
params.antiprompt.clear();
const auto & stop = data.find("stop");
if (stop != data.end() && stop->is_array()) {
for (const auto & word : *stop) {
// NOTE(review): `word` is a JSON value here; nlohmann::json::empty() reports false for
// string values, so this check presumably does not filter out "" - confirm intent.
if (!word.empty()) {
params.antiprompt.push_back(word);
}
}
}
// set reverse prompt from cli args if not set in the request
if (params.antiprompt.empty()) {
params.antiprompt = defaults.antiprompt;
}
}
// sampler chain: an array of sampler names or a string of sampler characters
{
const auto samplers = data.find("samplers");
if (samplers != data.end()) {
if (samplers->is_array()) {
params.sampling.samplers = common_sampler_types_from_names(*samplers, false);
} else if (samplers->is_string()){
params.sampling.samplers = common_sampler_types_from_chars(samplers->get<std::string>());
}
// NOTE(review): a "samplers" value of any other JSON type leaves the sampler list at its
// initial value rather than the server default - confirm this is intended.
} else {
params.sampling.samplers = defaults.sampling.samplers;
}
}
// each requested completion needs its own slot
if (params.n_cmpl > params_base.n_parallel) {
throw std::runtime_error("n_cmpl cannot be greater than the number of slots, please increase -np");
}
return params;
}
//
// result_timings
//
// Serializes the timing counters. The two draft fields are only included when
// draft_n is positive.
json result_timings::to_json() const {
    json out = {
        {"cache_n",                cache_n},
        {"prompt_n",               prompt_n},
        {"prompt_ms",              prompt_ms},
        {"prompt_per_token_ms",    prompt_per_token_ms},
        {"prompt_per_second",      prompt_per_second},
        {"predicted_n",            predicted_n},
        {"predicted_ms",           predicted_ms},
        {"predicted_per_token_ms", predicted_per_token_ms},
        {"predicted_per_second",   predicted_per_second},
    };

    const bool has_draft_stats = draft_n > 0;
    if (has_draft_stats) {
        out["draft_n"]          = draft_n;
        out["draft_n_accepted"] = draft_n_accepted;
    }

    return out;
}
//
// result_prompt_progress
//
// Serializes the prompt processing progress counters as a flat JSON object.
json result_prompt_progress::to_json() const {
    json progress = {
        {"total",     total},
        {"cache",     cache},
        {"processed", processed},
        {"time_ms",   time_ms},
    };
    return progress;
}
// Maps a stop_type to the string reported in the "stop_type" response field;
// any value without a dedicated name is reported as "none".
static inline std::string stop_type_to_str(stop_type type) {
    if (type == STOP_TYPE_EOS) {
        return "eos";
    }
    if (type == STOP_TYPE_WORD) {
        return "word";
    }
    if (type == STOP_TYPE_LIMIT) {
        return "limit";
    }
    return "none";
}
//
// completion_token_output
//
// Serializes the candidate probabilities recorded for this token as a JSON array.
// Each entry carries the token id, its text cut at the length returned by
// validate_utf8(), the raw bytes of the untrimmed text, and either "prob" or
// "logprob" depending on post_sampling_probs.
json completion_token_output::to_json(bool post_sampling_probs) const {
    const char * value_key = post_sampling_probs ? "prob" : "logprob";

    json entries = json::array();
    for (const auto & cand : probs) {
        std::string text(cand.txt);
        text.resize(validate_utf8(text));

        const auto value = post_sampling_probs ? cand.prob : logarithm(cand.prob);

        entries.push_back(json {
            {"id",      cand.tok},
            {"token",   text},
            {"bytes",   str_to_bytes(cand.txt)},
            {value_key, value},
        });
    }
    return entries;
}
// Serializes a sequence of generated tokens with their probabilities.
// For every token the output holds its id, its text cut at the length returned
// by validate_utf8(), the raw bytes of the untrimmed text, its own
// "prob"/"logprob" and the nested candidate list under "top_probs"/"top_logprobs".
json completion_token_output::probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs) {
    const char * value_key = post_sampling_probs ? "prob"      : "logprob";
    const char * top_key   = post_sampling_probs ? "top_probs" : "top_logprobs";

    json result = json::array();
    for (const auto & tok_out : probs) {
        std::string text(tok_out.text_to_send);
        text.resize(validate_utf8(text));

        const auto value = post_sampling_probs ? tok_out.prob : logarithm(tok_out.prob);

        result.push_back(json {
            {"id",      tok_out.tok},
            {"token",   text},
            {"bytes",   str_to_bytes(tok_out.text_to_send)},
            {value_key, value},
            {top_key,   tok_out.to_json(post_sampling_probs)},
        });
    }
    return result;
}
// Natural logarithm that stays representable in JSON: log(0) would be -inf,
// which nlohmann::json converts to null, so the lowest finite float is
// returned for a zero input instead.
float completion_token_output::logarithm(float x) {
    if (x == 0.0f) {
        return std::numeric_limits<float>::lowest();
    }
    return std::log(x);
}
// Returns the bytes of `str` as unsigned values, in order.
// The range constructor performs the same char -> unsigned char conversion as
// a per-character push_back loop, but sizes the buffer once up front.
std::vector<unsigned char> completion_token_output::str_to_bytes(const std::string & str) {
    return std::vector<unsigned char>(str.begin(), str.end());
}
//
// server_task_result_cmpl_final
//
// Dispatches to the serializer matching the response type of this result.
// For the chat, responses and Anthropic types the streaming variant is used
// when `stream` is set. update() must have been called beforehand.
json server_task_result_cmpl_final::to_json() {
    GGML_ASSERT(is_updated && "update() must be called before to_json()");

    switch (res_type) {
        case TASK_RESPONSE_TYPE_NONE: {
            return to_json_non_oaicompat();
        }
        case TASK_RESPONSE_TYPE_OAI_CMPL: {
            return to_json_oaicompat();
        }
        case TASK_RESPONSE_TYPE_OAI_CHAT: {
            if (stream) {
                return to_json_oaicompat_chat_stream();
            }
            return to_json_oaicompat_chat();
        }
        case TASK_RESPONSE_TYPE_OAI_RESP: {
            if (stream) {
                return to_json_oaicompat_resp_stream();
            }
            return to_json_oaicompat_resp();
        }
        case TASK_RESPONSE_TYPE_OAI_ASR: {
            return to_json_oaicompat_asr();
        }
        case TASK_RESPONSE_TYPE_ANTHROPIC: {
            if (stream) {
                return to_json_anthropic_stream();
            }
            return to_json_anthropic();
        }
        default:
            GGML_ASSERT(false && "Invalid task_response_type");
    }
}
// Builds the native (non OpenAI-compatible) final completion response.
// Token probabilities are attached only for non-streamed requests that
// collected any. When response_fields is non-empty, the result is reduced
// through json_get_nested_values().
json server_task_result_cmpl_final::to_json_non_oaicompat() {
    json body = json {
        {"index",               index},
        {"content",             content},
        {"tokens",              tokens},
        {"id_slot",             id_slot},
        {"stop",                true},
        {"model",               oaicompat_model},
        {"tokens_predicted",    n_decoded},
        {"tokens_evaluated",    n_prompt_tokens},
        {"generation_settings", generation_params.to_json()},
        {"prompt",              prompt},
        {"has_new_line",        has_new_line},
        {"truncated",           truncated},
        {"stop_type",           stop_type_to_str(stop)},
        {"stopping_word",       stopping_word},
        {"tokens_cached",       n_tokens_cached},
        {"timings",             timings.to_json()},
    };

    const bool attach_probs = !stream && !probs_output.empty();
    if (attach_probs) {
        body["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
    }

    if (response_fields.empty()) {
        return body;
    }
    return json_get_nested_values(response_fields, body);
}
// Builds the OpenAI-style "usage" object: decoded and prompt token counts,
// their sum, and the cached prompt token count under "prompt_tokens_details".
json server_task_result_cmpl_final::usage_json_oaicompat() {
    json usage = json {
        {"completion_tokens",     n_decoded},
        {"prompt_tokens",         n_prompt_tokens},
        {"total_tokens",          n_decoded + n_prompt_tokens},
        {"prompt_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
    };
    return usage;
}
// Builds the OpenAI-compatible "text_completion" response for a finished request.
// finish_reason is "stop" when generation ended on a stop word or EOS, and
// "length" otherwise. "logprobs" is null unless this is a non-streamed request
// with collected probabilities.
json server_task_result_cmpl_final::to_json_oaicompat() {
    const std::time_t created_at = std::time(0);

    json logprobs = json(nullptr); // OAI default to null
    const bool attach_probs = !stream && probs_output.size() > 0;
    if (attach_probs) {
        logprobs = json{
            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
        };
    }

    const bool stopped_naturally = stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS;
    const json finish_reason     = stopped_naturally ? "stop" : "length";

    json response = json {
        {"choices",            json::array({
            json{
                {"text",          content},
                {"index",         index},
                {"logprobs",      logprobs},
                {"finish_reason", finish_reason},
            }
        })},
        {"created",            created_at},
        {"model",              oaicompat_model},
        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "text_completion"},
        {"usage",              usage_json_oaicompat()},
        {"id",                 oaicompat_cmpl_id}
    };

    // extra fields for debugging purposes
    if (verbose) {
        response["__verbose"] = to_json_non_oaicompat();
    }

    // timings are appended whenever prompt_n is non-negative
    if (timings.prompt_n >= 0) {
        response.push_back({"timings", timings.to_json()});
    }

    return response;
}
// Build the final, non-streamed result in the OAI-compatible chat layout ("chat.completion").
json server_task_result_cmpl_final::to_json_oaicompat_chat() {
    // use the parsed chat message when there is one, otherwise wrap the raw content
    common_chat_msg msg;
    if (oaicompat_msg.empty()) {
        msg.role    = "assistant";
        msg.content = content;
    } else {
        msg = oaicompat_msg;
    }

    // stop word / EOS: "tool_calls" when the message carries tool calls, "stop" otherwise
    std::string finish_reason = "length";
    const bool hit_stop = (stop == STOP_TYPE_WORD) || (stop == STOP_TYPE_EOS);
    if (hit_stop) {
        finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
    }

    json choice {
        {"finish_reason", finish_reason},
        {"index",         index},
        {"message",       msg.to_json_oaicompat()},
    };

    const bool attach_probs = !stream && !probs_output.empty();
    if (attach_probs) {
        choice["logprobs"] = json {
            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
        };
    }

    const std::time_t created = std::time(0);

    json res = json {
        {"choices",            json::array({choice})},
        {"created",            created},
        {"model",              oaicompat_model},
        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "chat.completion"},
        {"usage",              usage_json_oaicompat()},
        {"id",                 oaicompat_cmpl_id}
    };

    // debugging aid: embed the native representation as well
    if (verbose) {
        res["__verbose"] = to_json_non_oaicompat();
    }

    if (timings.prompt_n >= 0) {
        res.push_back({"timings", timings.to_json()});
    }

    return res;
}
// Build the closing chunks of a streamed OAI-compatible chat completion.
// Returns a JSON array: one chunk per pending message diff, one chunk carrying the
// finish_reason, and (when include_usage is set) a last chunk with empty choices and usage.
json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
    const std::time_t created = std::time(0);

    std::string finish_reason = "length";
    const bool hit_stop = (stop == STOP_TYPE_WORD) || (stop == STOP_TYPE_EOS);
    if (hit_stop) {
        finish_reason = oaicompat_msg.tool_calls.empty() ? "stop" : "tool_calls";
    }

    // all chunks share the same envelope; only the "choices" array differs
    const auto make_chunk = [&](const json & choices) {
        return json {
            {"choices",            choices},
            {"created",            created},
            {"id",                 oaicompat_cmpl_id},
            {"model",              oaicompat_model},
            {"system_fingerprint", std::string(llama_build_info())},
            {"object",             "chat.completion.chunk"},
        };
    };

    json chunks = json::array();

    // remaining message diffs, each with a null finish_reason
    for (const auto & diff : oaicompat_msg_diffs) {
        chunks.push_back(make_chunk(json::array({
            json {
                {"finish_reason", nullptr},
                {"index",         index},
                {"delta",         server_chat_msg_diff_to_json_oaicompat(diff)},
            },
        })));
    }

    // terminating chunk: empty delta plus the actual finish_reason
    chunks.push_back(make_chunk(json::array({
        json {
            {"finish_reason", finish_reason},
            {"index",         index},
            {"delta",         json::object()},
        },
    })));

    if (include_usage) {
        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
        json usage_chunk = make_chunk(json::array());
        usage_chunk["usage"] = usage_json_oaicompat();
        chunks.push_back(usage_chunk);
    }

    // timings go on whichever chunk ended up last
    if (timings.prompt_n >= 0) {
        chunks.back().push_back({"timings", timings.to_json()});
    }

    // debugging aid: the native representation is attached to the first chunk
    if (verbose && !chunks.empty()) {
        chunks.front()["__verbose"] = to_json_non_oaicompat();
    }

    return chunks;
}
// Build the final, non-streamed result as an OAI-compatible "response" object.
// The output array holds, in order: an optional reasoning item, an optional message
// item, and one function_call item per tool call in oaicompat_msg.
json server_task_result_cmpl_final::to_json_oaicompat_resp() {
    // use the parsed chat message when there is one, otherwise wrap the raw content
    common_chat_msg msg;
    if (oaicompat_msg.empty()) {
        msg.role    = "assistant";
        msg.content = content;
    } else {
        msg = oaicompat_msg;
    }

    std::vector<json> items;

    if (!msg.reasoning_content.empty()) {
        items.push_back(json {
            {"id",                "rs_" + random_string()},
            {"summary",           json::array()},
            {"type",              "reasoning"},
            {"content",           json::array({ json {
                {"text", msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
            {"status",            "completed"},
        });
    }

    if (!msg.content.empty()) {
        items.push_back(json {
            {"content", json::array({ json {
                {"type",        "output_text"},
                {"annotations", json::array()},
                {"logprobs",    json::array()},
                {"text",        msg.content},
            }})},
            {"id",      "msg_" + random_string()},
            {"role",    msg.role},
            {"status",  "completed"},
            {"type",    "message"},
        });
    }

    for (const common_chat_tool_call & call : oaicompat_msg.tool_calls) {
        items.push_back(json {
            {"type",      "function_call"},
            {"status",    "completed"},
            {"arguments", call.arguments},
            {"call_id",   "fc_" + call.id},
            {"name",      call.name},
        });
    }

    // the same timestamp is used for both created_at and completed_at
    const std::time_t now = std::time(0);
    const auto n_total = n_decoded + n_prompt_tokens;

    json res = {
        {"completed_at", now},
        {"created_at",   now},
        {"id",           oai_resp_id},
        {"model",        oaicompat_model},
        {"object",       "response"},
        {"output",       items},
        {"status",       "completed"},
        {"usage",        json {
            {"input_tokens",         n_prompt_tokens},
            {"output_tokens",        n_decoded},
            {"total_tokens",         n_total},
            {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
        }},
    };

    return res;
}
// Build the closing server-sent events of a streamed OAI-compatible "response".
//
// For every part present in oaicompat_msg (reasoning, message text, each tool call) the
// matching "*.done" events are emitted, and the finished item is also collected into
// `output`. A single "response.completed" event carrying that `output` array and the
// token usage is appended last.
//
// Returns the events as a list of {"event", "data"} objects, in emission order.
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
std::vector<json> server_sent_events;
std::vector<json> output;
// reasoning item: one output_item.done event, reusing the id stored in oai_resp_reasoning_id
if (oaicompat_msg.reasoning_content != "") {
const json output_item = json {
{"id", oai_resp_reasoning_id},
{"summary", json::array()},
{"type", "reasoning"},
{"content", json::array({ json {
{"text", oaicompat_msg.reasoning_content},
{"type", "reasoning_text"},
}})},
{"encrypted_content", ""},
};
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", output_item}
}}
});
output.push_back(output_item);
}
// message item: closed in three steps - output_text.done, content_part.done, output_item.done
if (oaicompat_msg.content != "") {
server_sent_events.push_back(json {
{"event", "response.output_text.done"},
{"data", json {
{"type", "response.output_text.done"},
{"item_id", oai_resp_message_id},
{"text", oaicompat_msg.content}
}}
});
const json content_part = {
{"type", "output_text"},
{"annotations", json::array()},
{"logprobs", json::array()},
{"text", oaicompat_msg.content}
};
server_sent_events.push_back(json {
{"event", "response.content_part.done"},
{"data", json {
{"type", "response.content_part.done"},
{"item_id", oai_resp_message_id},
{"part", content_part}
}}
});
const json output_item = {
{"type", "message"},
{"status", "completed"},
{"id", oai_resp_message_id},
{"content", json::array({content_part})},
{"role", "assistant"}
};
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", output_item}
}}
});
output.push_back(output_item);
}
// tool calls: one completed function_call item per call, call_id prefixed with "fc_"
for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
const json output_item = {
{"type", "function_call"},
{"status", "completed"},
{"arguments", tool_call.arguments},
{"call_id", "fc_" + tool_call.id},
{"name", tool_call.name}
};
server_sent_events.push_back(json {
{"event", "response.output_item.done"},
{"data", json {
{"type", "response.output_item.done"},
{"item", output_item}
}}
});
output.push_back(output_item);
}
// final event: the whole response object, including every item collected above and the usage
std::time_t t = std::time(0);
server_sent_events.push_back(json {
{"event", "response.completed"},
{"data", json {
{"type", "response.completed"},
{"response", json {
{"id", oai_resp_id},
{"object", "response"},
{"created_at", t},
{"status", "completed"},
{"model", oaicompat_model},
{"output", output},
{"usage", json {
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
{"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
}}
}},
}}
});
return server_sent_events;
}
// Build the final "transcript.text.done" event: the text comes from oaicompat_msg.content,
// followed by token usage.
json server_task_result_cmpl_final::to_json_oaicompat_asr() {
    const auto n_total = n_decoded + n_prompt_tokens;

    return json {
        {"type",  "transcript.text.done"},
        {"text",  oaicompat_msg.content},
        {"usage", json {
            {"type",                 "tokens"},
            {"input_tokens",         n_prompt_tokens},
            {"output_tokens",        n_decoded},
            {"total_tokens",         n_total},
            {"input_tokens_details", json { {"cached_tokens", n_prompt_tokens_cache} }},
        }},
    };
}
// Build the final, non-streamed result in the Anthropic Messages layout.
// Content blocks are emitted in the order: thinking, text, then one tool_use block per tool call.
json server_task_result_cmpl_final::to_json_anthropic() {
    // stop word / EOS: "tool_use" when tool calls are present, "end_turn" otherwise
    const bool hit_stop = (stop == STOP_TYPE_WORD) || (stop == STOP_TYPE_EOS);
    std::string stop_reason = "max_tokens";
    if (hit_stop) {
        stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
    }

    // use the parsed chat message when there is one, otherwise wrap the raw content
    common_chat_msg msg;
    if (oaicompat_msg.empty()) {
        msg.role    = "assistant";
        msg.content = content;
    } else {
        msg = oaicompat_msg;
    }

    json blocks = json::array();

    // thinking block comes first (Anthropic extended thinking format)
    if (!msg.reasoning_content.empty()) {
        blocks.push_back({
            {"type",      "thinking"},
            {"thinking",  msg.reasoning_content},
            {"signature", ""} // empty signature for local models (no cryptographic verification)
        });
    }

    if (!msg.content.empty()) {
        blocks.push_back({
            {"type", "text"},
            {"text", msg.content}
        });
    }

    for (const auto & call : msg.tool_calls) {
        json block = {
            {"type", "tool_use"},
            {"id",   call.id},
            {"name", call.name}
        };

        // arguments that fail to parse as JSON are replaced by an empty object
        try {
            block["input"] = json::parse(call.arguments);
        } catch (const std::exception &) {
            block["input"] = json::object();
        }

        blocks.push_back(block);
    }

    json res = {
        {"id",            oaicompat_cmpl_id},
        {"type",          "message"},
        {"role",          "assistant"},
        {"content",       blocks},
        {"model",         oaicompat_model},
        {"stop_reason",   stop_reason},
        {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)},
        {"usage", {
            {"cache_read_input_tokens", n_prompt_tokens_cache},
            {"input_tokens",            n_prompt_tokens - n_prompt_tokens_cache},
            {"output_tokens",           n_decoded}
        }}
    };

    return res;
}
// Build the closing events of a streamed Anthropic Messages result.
//
// Emission order:
//   1. content_block_start / content_block_delta events for the diffs in oaicompat_msg_diffs
//   2. content_block_stop for the thinking block, the text block and every tool call
//      (the thinking block gets a signature_delta first)
//   3. message_delta with the stop reason and output token count
//   4. message_stop
//
// Returns a JSON array of {"event", "data"} objects.
json server_task_result_cmpl_final::to_json_anthropic_stream() {
json events = json::array();
// stop word / EOS: "tool_use" when tool calls are present, "end_turn" otherwise
std::string stop_reason = "max_tokens";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
}
bool has_thinking = !oaicompat_msg.reasoning_content.empty();
bool has_text = !oaicompat_msg.content.empty();
size_t num_tool_calls = oaicompat_msg.tool_calls.size();
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
size_t thinking_block_index = 0;
size_t text_block_index = has_thinking ? 1 : 0;
// these flags are local to this call: they only track blocks opened by the diffs processed below
bool thinking_block_started = false;
bool text_block_started = false;
std::unordered_set<size_t> tool_calls_started;
for (const auto & diff : oaicompat_msg_diffs) {
// handle thinking/reasoning content
if (!diff.reasoning_content_delta.empty()) {
if (!thinking_block_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", thinking_block_index},
{"content_block", {
{"type", "thinking"},
{"thinking", ""}
}}
}}
});
thinking_block_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", thinking_block_index},
{"delta", {
{"type", "thinking_delta"},
{"thinking", diff.reasoning_content_delta}
}}
}}
});
}
// handle regular text content
if (!diff.content_delta.empty()) {
if (!text_block_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", text_block_index},
{"content_block", {
{"type", "text"},
{"text", ""}
}}
}}
});
text_block_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", text_block_index},
{"delta", {
{"type", "text_delta"},
{"text", diff.content_delta}
}}
}}
});
}
// handle tool calls
if (diff.tool_call_index != std::string::npos) {
// tool_use blocks are numbered after the thinking and text blocks (when those exist)
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;
// the first diff seen for a tool call opens its block, using id/name from the accumulated message
if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", content_block_index},
{"content_block", {
{"type", "tool_use"},
{"id", full_tool_call.id},
{"name", full_tool_call.name}
}}
}}
});
tool_calls_started.insert(diff.tool_call_index);
}
if (!diff.tool_call_delta.arguments.empty()) {
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", content_block_index},
{"delta", {
{"type", "input_json_delta"},
{"partial_json", diff.tool_call_delta.arguments}
}}
}}
});
}
}
}
// close content blocks in order
if (has_thinking) {
// Anthropic API requires a signature_delta before closing thinking blocks
// We use an empty signature since we can't generate a cryptographic signature for local models
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", thinking_block_index},
{"delta", {
{"type", "signature_delta"},
{"signature", ""}
}}
}}
});
events.push_back({
{"event", "content_block_stop"},
{"data", {
{"type", "content_block_stop"},
{"index", thinking_block_index}
}}
});
}
if (has_text) {
events.push_back({
{"event", "content_block_stop"},
{"data", {
{"type", "content_block_stop"},
{"index", text_block_index}
}}
});
}
// one content_block_stop per tool call in the accumulated message, using the same index formula as above
for (size_t i = 0; i < num_tool_calls; i++) {
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
events.push_back({
{"event", "content_block_stop"},
{"data", {
{"type", "content_block_stop"},
{"index", content_block_index}
}}
});
}
// message_delta carries the stop reason; stop_sequence is null when stopping_word is empty
events.push_back({
{"event", "message_delta"},
{"data", {
{"type", "message_delta"},
{"delta", {
{"stop_reason", stop_reason},
{"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}
}},
{"usage", {
{"output_tokens", n_decoded}
}}
}}
});
events.push_back({
{"event", "message_stop"},
{"data", {
{"type", "message_stop"}
}}
});
return events;
}
//
// server_task_result_cmpl_partial
//
// Fold this partial result into the shared per-task state and snapshot what the
// to_json_*() methods need. Must be called before to_json() (which asserts is_updated).
void server_task_result_cmpl_partial::update(task_result_state & state) {
    is_updated = true;
    state.update_chat_msg(content, true, oaicompat_msg_diffs);

    // snapshot of the streaming state as it was BEFORE this chunk; read by to_json_*()
    thinking_block_started = state.thinking_block_started;
    text_block_started     = state.text_block_started;

    oai_resp_id           = state.oai_resp_id;
    oai_resp_reasoning_id = state.oai_resp_reasoning_id;
    oai_resp_message_id   = state.oai_resp_message_id;
    oai_resp_fc_id        = state.oai_resp_fc_id;

    // whether the accumulated message has any reasoning content so far
    anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();

    // advance the shared state for the NEXT chunk based on what this chunk's diffs contain
    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
        const bool has_reasoning = !diff.reasoning_content_delta.empty();
        const bool has_content   = !diff.content_delta.empty();

        state.thinking_block_started = state.thinking_block_started || has_reasoning;
        state.text_block_started     = state.text_block_started     || has_content;

        // a diff with a tool call name records that call's id in the shared state
        if (!diff.tool_call_delta.name.empty()) {
            state.oai_resp_fc_id = diff.tool_call_delta.id;
        }
    }
}
// Serialize this partial (streaming) result in the format selected by res_type.
// Requires update() to have been called first. A "begin" result yields null.
json server_task_result_cmpl_partial::to_json() {
    GGML_ASSERT(is_updated && "update() must be called before to_json()");

    if (is_begin) {
        // simply signal to HTTP handler to send the headers and status code
        return nullptr;
    }

    switch (res_type) {
        case TASK_RESPONSE_TYPE_NONE:      return to_json_non_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CMPL:  return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:  return to_json_oaicompat_chat();
        case TASK_RESPONSE_TYPE_OAI_RESP:  return to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_OAI_ASR:   return to_json_oaicompat_asr();
        case TASK_RESPONSE_TYPE_ANTHROPIC: return to_json_anthropic();
        default:
            GGML_ASSERT(false && "Invalid task_response_type");
    }
}
// Serialize a partial (streaming) result in the server's native, non-OAI-compatible layout.
json server_task_result_cmpl_partial::to_json_non_oaicompat() {
    json chunk = json {
        {"index",            index},
        {"content",          content},
        {"tokens",           tokens},
        {"stop",             false},
        {"id_slot",          id_slot},
        {"tokens_predicted", n_decoded},
        {"tokens_evaluated", n_prompt_tokens},
    };

    // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
    const bool has_timings = timings.prompt_n > 0;
    if (has_timings) {
        chunk.push_back({"timings", timings.to_json()});
    }

    if (is_progress) {
        chunk.push_back({"prompt_progress", progress.to_json()});
    }

    const bool has_probs = !prob_output.probs.empty();
    if (has_probs) {
        chunk["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
    }

    return chunk;
}
// Serialize a partial (streaming) result in the OAI-compatible text completion layout.
// finish_reason is always null here.
json server_task_result_cmpl_partial::to_json_oaicompat() {
    // OAI defaults logprobs to null; fill it only when this chunk carries probabilities
    json logprobs = json(nullptr);
    if (!prob_output.probs.empty()) {
        logprobs = json {
            {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
        };
    }

    const std::time_t created = std::time(0);

    json chunk = json {
        {"choices", json::array({
            json {
                {"text",          content},
                {"index",         index},
                {"logprobs",      logprobs},
                {"finish_reason", nullptr},
            }
        })},
        {"created",            created},
        {"model",              oaicompat_model},
        {"system_fingerprint", std::string(llama_build_info())},
        {"object",             "text_completion"},
        {"id",                 oaicompat_cmpl_id}
    };

    // debugging aid: embed the native representation as well
    if (verbose) {
        chunk["__verbose"] = to_json_non_oaicompat();
    }

    if (timings.prompt_n >= 0) {
        chunk.push_back({"timings", timings.to_json()});
    }

    if (is_progress) {
        chunk.push_back({"prompt_progress", progress.to_json()});
    }

    return chunk;
}
// Serialize a partial (streaming) result as a list of OAI-compatible "chat.completion.chunk" objects.
//
// One chunk is produced per message diff. An initial role-only chunk is prepended on the
// first decoded token (n_decoded == 1) or for progress updates. Log-probabilities, timings
// and prompt progress, when present, are attached to the last chunk only.
// The returned list can be empty when there is nothing to send.
json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
    const bool first = n_decoded == 1;
    const std::time_t t = std::time(0);

    std::vector<json> deltas;

    // wraps a delta object into a full chunk envelope and queues it
    auto add_delta = [&](const json & delta) {
        deltas.push_back({
            {"choices", json::array({
                json {
                    {"finish_reason", nullptr},
                    {"index", index},
                    {"delta", delta},
                },
            })},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"system_fingerprint", std::string(llama_build_info())},
            {"object", "chat.completion.chunk"},
        });
    };

    // We have to send an initial update to conform to openai behavior
    if (first || is_progress) {
        add_delta({
            {"role", "assistant"},
            {"content", nullptr},
        });
    }

    for (const auto & diff : oaicompat_msg_diffs) {
        add_delta(server_chat_msg_diff_to_json_oaicompat(diff));
    }

    if (!deltas.empty()) {
        // per-chunk extras are attached to the last queued chunk
        auto & last_json = deltas.back();
        GGML_ASSERT(last_json.at("choices").size() >= 1);

        if (prob_output.probs.size() > 0) {
            last_json.at("choices").at(0)["logprobs"] = json {
                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
            };
        }

        if (timings.prompt_n >= 0) {
            last_json.push_back({"timings", timings.to_json()});
        }

        if (is_progress) {
            last_json.push_back({"prompt_progress", progress.to_json()});
        }
    }

    return deltas;
}
// Serialize a partial (streaming) result as a list of OAI-compatible "response" events.
//
// On the first decoded token (n_decoded == 1) the "response.created" and
// "response.in_progress" events are emitted. Then, for each message diff, the matching
// "output_item.added" / "*.delta" events are appended in order.
//
// NOTE: this method is not read-only - it sets thinking_block_started, text_block_started
// and oai_resp_fc_id on this object while walking the diffs.
json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
std::vector<json> events;
if (n_decoded == 1) {
events.push_back(json {
{"event", "response.created"},
{"data", json {
{"type", "response.created"},
{"response", json {
{"id", oai_resp_id},
{"object", "response"},
{"status", "in_progress"},
}},
}},
});
events.push_back(json {
{"event", "response.in_progress"},
{"data", json {
{"type", "response.in_progress"},
{"response", json {
{"id", oai_resp_id},
{"object", "response"},
{"status", "in_progress"},
}},
}},
});
}
for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
// reasoning text: open the reasoning item once, then stream its delta
if (!diff.reasoning_content_delta.empty()) {
if (!thinking_block_started) {
events.push_back(json {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"id", oai_resp_reasoning_id},
{"summary", json::array()},
{"type", "reasoning"},
{"content", json::array()},
{"encrypted_content", ""},
{"status", "in_progress"},
}},
}},
});
thinking_block_started = true;
}
events.push_back(json {
{"event", "response.reasoning_text.delta"},
{"data", json {
{"type", "response.reasoning_text.delta"},
{"delta", diff.reasoning_content_delta},
{"item_id", oai_resp_reasoning_id},
}},
});
}
// message text: open the message item and its content part once, then stream the delta
if (!diff.content_delta.empty()) {
if (!text_block_started) {
events.push_back(json {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"content", json::array()},
{"id", oai_resp_message_id},
{"role", "assistant"},
{"status", "in_progress"},
{"type", "message"},
}},
}},
});
events.push_back(json {
{"event", "response.content_part.added"},
{"data", json {
{"type", "response.content_part.added"},
{"item_id", oai_resp_message_id},
{"part", json {
{"type", "output_text"},
{"text", ""},
}},
}},
});
text_block_started = true;
}
events.push_back(json {
{"event", "response.output_text.delta"},
{"data", json {
{"type", "response.output_text.delta"},
{"item_id", oai_resp_message_id},
{"delta", diff.content_delta},
}},
});
}
// a diff carrying a tool call name announces a new function_call item
if (!diff.tool_call_delta.name.empty()) {
events.push_back(json {
{"event", "response.output_item.added"},
{"data", json {
{"type", "response.output_item.added"},
{"item", json {
{"arguments", ""},
{"call_id", "fc_" + diff.tool_call_delta.id},
{"name", diff.tool_call_delta.name},
{"type", "function_call"},
{"status", "in_progress"},
}},
}},
});
// remember the id so the argument deltas below (in this call) reference this call
oai_resp_fc_id = diff.tool_call_delta.id;
}
// argument fragments are attributed to the most recently recorded tool call id
if (!diff.tool_call_delta.arguments.empty()) {
events.push_back(json {
{"event", "response.function_call_arguments.delta"},
{"data", json {
{"type", "response.function_call_arguments.delta"},
{"delta", diff.tool_call_delta.arguments},
{"item_id", "fc_" + oai_resp_fc_id},
}},
});
}
}
return events;
}
// Serialize a partial (streaming) result as a "transcript.text.delta" event carrying this chunk's content.
json server_task_result_cmpl_partial::to_json_oaicompat_asr() {
    return json {
        {"type",  "transcript.text.delta"},
        {"delta", content},
    };
}
// Serialize a partial (streaming) result as a JSON array of Anthropic Messages stream events.
//
// On the first decoded token (n_decoded == 1) a "message_start" event is emitted. Then,
// for each message diff, content_block_start / content_block_delta events are appended
// for thinking, text and tool_use blocks. Blocks are not closed here.
json server_task_result_cmpl_partial::to_json_anthropic() {
json events = json::array();
bool first = (n_decoded == 1);
// block state is read from the members thinking_block_started / text_block_started
// (copied into locals further down)
if (first) {
events.push_back({
{"event", "message_start"},
{"data", {
{"type", "message_start"},
{"message", {
{"id", oaicompat_cmpl_id},
{"type", "message"},
{"role", "assistant"},
{"content", json::array()},
{"model", oaicompat_model},
{"stop_reason", nullptr},
{"stop_sequence", nullptr},
{"usage", {
{"cache_read_input_tokens", n_prompt_tokens_cache},
{"input_tokens", n_prompt_tokens - n_prompt_tokens_cache},
{"output_tokens", 0}
}}
}}
}}
});
}
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
size_t thinking_block_index = 0;
// use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
size_t text_block_index = anthropic_has_reasoning ? 1 : 0;
// use local copies of streaming state (copied from task_result_state in update())
// these reflect the state BEFORE this chunk was processed
bool thinking_started = thinking_block_started;
bool text_started = text_block_started;
for (const auto & diff : oaicompat_msg_diffs) {
// handle thinking/reasoning content
if (!diff.reasoning_content_delta.empty()) {
if (!thinking_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", thinking_block_index},
{"content_block", {
{"type", "thinking"},
{"thinking", ""}
}}
}}
});
thinking_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", thinking_block_index},
{"delta", {
{"type", "thinking_delta"},
{"thinking", diff.reasoning_content_delta}
}}
}}
});
}
// handle regular text content
if (!diff.content_delta.empty()) {
if (!text_started) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", text_block_index},
{"content_block", {
{"type", "text"},
{"text", ""}
}}
}}
});
text_started = true;
}
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", text_block_index},
{"delta", {
{"type", "text_delta"},
{"text", diff.content_delta}
}}
}}
});
}
// handle tool calls
if (diff.tool_call_index != std::string::npos) {
// use anthropic_has_reasoning for thinking block count (persists across calls)
// the text block is counted only if it has been started by now (text_started)
size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_started ? 1 : 0) + diff.tool_call_index;
// a diff carrying the tool name opens the tool_use block
if (!diff.tool_call_delta.name.empty()) {
events.push_back({
{"event", "content_block_start"},
{"data", {
{"type", "content_block_start"},
{"index", content_block_index},
{"content_block", {
{"type", "tool_use"},
{"id", diff.tool_call_delta.id},
{"name", diff.tool_call_delta.name}
}}
}}
});
}
if (!diff.tool_call_delta.arguments.empty()) {
events.push_back({
{"event", "content_block_delta"},
{"data", {
{"type", "content_block_delta"},
{"index", content_block_index},
{"delta", {
{"type", "input_json_delta"},
{"partial_json", diff.tool_call_delta.arguments}
}}
}}
});
}
}
}
return events;
}
//
// server_task_result_embd
//
// Serialize the embedding result: OAI-compatible layout for TASK_RESPONSE_TYPE_OAI_EMBD, native layout otherwise.
json server_task_result_embd::to_json() {
    if (res_type == TASK_RESPONSE_TYPE_OAI_EMBD) {
        return to_json_oaicompat();
    }
    return to_json_non_oaicompat();
}
// Native layout: the index together with the complete `embedding` member.
json server_task_result_embd::to_json_non_oaicompat() {
    json res = json {
        {"index",     index},
        {"embedding", embedding},
    };
    return res;
}
// OAI-compatible layout: only the first element of `embedding` is reported, plus the evaluated token count.
json server_task_result_embd::to_json_oaicompat() {
    json res = json {
        {"index",            index},
        {"embedding",        embedding[0]},
        {"tokens_evaluated", n_tokens},
    };
    return res;
}
//
// server_task_result_rerank
//
// Serialize a rerank result: index, score and the number of evaluated tokens.
json server_task_result_rerank::to_json() {
    json res = json {
        {"index",            index},
        {"score",            score},
        {"tokens_evaluated", n_tokens},
    };
    return res;
}
//
// server_task_result_error
//
// Serialize an error result. Context-size errors additionally report the prompt size and n_ctx.
json server_task_result_error::to_json() {
    json body = format_error_response(err_msg, err_type);

    if (err_type != ERROR_TYPE_EXCEED_CONTEXT_SIZE) {
        return body;
    }

    body["n_prompt_tokens"] = n_prompt_tokens;
    body["n_ctx"]           = n_ctx;
    return body;
}
//
// server_task_result_metrics
//
// Serialize the collected server metrics, including the per-slot data, as a flat object.
json server_task_result_metrics::to_json() {
    json metrics = json {
        { "idle",                            n_idle_slots },
        { "processing",                      n_processing_slots },
        { "deferred",                        n_tasks_deferred },
        { "t_start",                         t_start },

        { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
        { "t_tokens_generation_total",       t_tokens_generation_total },
        { "n_tokens_predicted_total",        n_tokens_predicted_total },
        { "t_prompt_processing_total",       t_prompt_processing_total },

        { "n_tokens_max",                    n_tokens_max },

        { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
        { "t_prompt_processing",             t_prompt_processing },
        { "n_tokens_predicted",              n_tokens_predicted },
        { "t_tokens_generation",             t_tokens_generation },

        { "n_decode_total",                  n_decode_total },
        { "n_busy_slots_total",              n_busy_slots_total },

        { "slots",                           slots_data },
    };
    return metrics;
}
//
// server_task_result_slot_save_load
//
// Serialize the outcome of a slot save or restore. Both directions share the same shape;
// only the key names for the token count, byte count and timing differ.
json server_task_result_slot_save_load::to_json() {
    const char * key_tokens = is_save ? "n_saved"   : "n_restored";
    const char * key_bytes  = is_save ? "n_written" : "n_read";
    const char * key_time   = is_save ? "save_ms"   : "restore_ms";

    return json {
        { "id_slot",  id_slot },
        { "filename", filename },
        { key_tokens, n_tokens },
        { key_bytes,  n_bytes },
        { "timings", {
            { key_time, t_ms }
        }},
    };
}
//
// server_task_result_slot_erase
//
// Serialize the outcome of a slot erase: the slot id and the n_erased count.
json server_task_result_slot_erase::to_json() {
    json res = json {
        { "id_slot",  id_slot },
        { "n_erased", n_erased },
    };
    return res;
}
//
// server_task_result_get_lora
//
// Serialize the LoRA adapters as a JSON array; each entry's "id" is its position in `loras`.
json server_task_result_get_lora::to_json() {
    json adapters = json::array();

    size_t id = 0;
    for (auto & lora : loras) {
        json entry = {
            {"id",            id},
            {"path",          lora.info.path},
            {"scale",         lora.info.scale},
            {"task_name",     lora.info.task_name},
            {"prompt_prefix", lora.info.prompt_prefix},
        };

        // invocation details are only reported when the adapter has invocation tokens
        const bool has_invocation = !lora.alora_invocation_tokens.empty();
        if (has_invocation) {
            entry["alora_invocation_string"] = lora.alora_invocation_string;
            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
        }

        adapters.push_back(std::move(entry));
        ++id;
    }

    return adapters;
}
//
// server_task_result_apply_lora
//
// Applying LoRA adapters has no payload to report; the result is always {"success": true}.
json server_task_result_apply_lora::to_json() {
    json res = json {{ "success", true }};
    return res;
}
//
// server_prompt_cache
//
size_t server_prompt_cache::size() const {
size_t res = 0;
for (const auto & state : states) {
res += state.size();
}
return res;
}
size_t server_prompt_cache::n_tokens() const {
size_t res = 0;
for (const auto & state : states) {
res += state.n_tokens();
}
return res;
}
// Reserve a new cache entry for `prompt`, with buffers of the requested sizes for the
// target and draft state data.
//
// Returns a pointer to the newly appended entry, or nullptr when
//   - an existing entry's common prefix with the prompt spans the whole prompt, or
//   - the buffers cannot be allocated (the size limit is then lowered and update() is run).
// Entries whose common prefix with the prompt spans the whole entry are removed first.
server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size_tgt, size_t state_size_dft) {
    // first check if the current state is contained fully in the cache
    for (auto & cached : states) {
        const int n_shared = cached.tokens.get_common_prefix(prompt.tokens);

        if (n_shared == (int) prompt.tokens.size()) {
            SRV_INF("%s", " - prompt is already in the cache, skipping\n");
            return nullptr;
        }
    }

    // next, remove any cached prompts that are fully contained in the current prompt
    auto it = states.begin();
    while (it != states.end()) {
        const int n_shared = it->tokens.get_common_prefix(prompt.tokens);

        if (n_shared != (int) it->tokens.size()) {
            ++it;
            continue;
        }

        SRV_WRN(" - removing obsolete cached prompt with length %d\n", n_shared);
        it = states.erase(it);
    }

    std::vector<uint8_t> buf_tgt;
    std::vector<uint8_t> buf_dft;

    // check if we can allocate enough memory for the new state
    try {
        buf_tgt.resize(state_size_tgt);
        buf_dft.resize(state_size_dft);
    } catch (const std::bad_alloc & e) {
        SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what());

        // shrink the size limit to 40% of the current cache size (at least 1) and evict accordingly
        limit_size = std::max<size_t>(1, 0.4*size());

        SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0));

        update();

        return nullptr;
    }

    states.push_back({
        /*.tokens      =*/ prompt.tokens.clone(),
        /*.data        =*/ {
            /*.main =*/ std::move(buf_tgt),
            /*.drft =*/ std::move(buf_dft),
        },
        /*.checkpoints =*/ prompt.checkpoints,
    });

    return &states.back();
}
// Try to replace the slot's current `prompt` with a better-matching cached entry for `tokens_new`.
//
// Two ratios are compared (lcp = result of get_common_prefix(), presumably the length of the
// shared token prefix):
//   f_keep = lcp / size of the candidate's tokens
//   sim    = lcp / size of tokens_new
// A cached entry is selected only if it improves on BOTH ratios versus the current best,
// and entries with f_keep < 0.25 are never considered.
//
// When an entry is selected, its state data is restored into ctx_tgt (and ctx_dft when draft
// data is present) for sequence `id_slot`, the entry is moved into `prompt` and removed
// from the cache.
//
// Returns false only when restoring state data fails; returns true otherwise, including
// when no better entry was found and `prompt` is left untouched.
bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx_tgt, llama_context * ctx_dft, int32_t id_slot) {
// baseline: how well the slot's current prompt already matches the new tokens
const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);
float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
float sim_best = float(lcp_best) / tokens_new.size();
SRV_INF(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
auto it_best = states.end();
// find the most similar cached prompt, that would also preserve the most context
for (auto it = states.begin(); it != states.end(); ++it) {
const int lcp_cur = it->tokens.get_common_prefix(tokens_new);
const float f_keep_cur = float(lcp_cur) / it->tokens.size();
const float sim_cur = float(lcp_cur) / tokens_new.size();
// don't trash large prompts
if (f_keep_cur < 0.25f) {
continue;
}
// the candidate must be strictly better on both ratios
if (f_keep_best < f_keep_cur && sim_best < sim_cur) {
f_keep_best = f_keep_cur;
sim_best = sim_cur;
it_best = it;
}
}
if (it_best != states.end()) {
SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
// restore the main state into the target context, then release the buffer
{
auto & data = it_best->data.main;
const size_t size = data.size();
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, data.data(), size, id_slot, 0);
if (n != size) {
SRV_ERR("failed to restore state with size %zu\n", size);
return false;
}
data.clear();
data.shrink_to_fit();
}
// restore the draft state, if any
// NOTE(review): if this step fails, the entry stays in the cache with its main buffer
// already cleared above - confirm whether such an entry should be erased instead
{
auto & data = it_best->data.drft;
if (!data.empty()) {
GGML_ASSERT(ctx_dft);
const size_t size = data.size();
const size_t n = llama_state_seq_set_data_ext(ctx_dft, data.data(), size, id_slot, 0);
if (n != size) {
SRV_WRN("failed to restore state with size %zu\n", size);
return false;
}
data.clear();
data.shrink_to_fit();
}
}
// hand the entry over to the caller and drop it from the cache
prompt = std::move(*it_best);
states.erase(it_best);
}
return true;
}
// Enforce the cache limits and log the resulting cache state.
//
// Entries are evicted from the front of `states` (logged as the oldest entry) while
//   - the total size exceeds limit_size (when limit_size > 0), and then
//   - the total token count exceeds the effective token limit (when limit_tokens > 0).
// At least one entry is always kept, regardless of the limits.
void server_prompt_cache::update() {
    if (limit_size > 0) {
        // always keep at least one state, regardless of the limits
        // (the loop condition already guarantees `states` is non-empty inside the body)
        while (states.size() > 1 && size() > limit_size) {
            SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));

            states.pop_front();
        }
    }

    // average size per token
    const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));

    // dynamically increase the token limit if it can fit in the memory limit
    const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;

    if (limit_tokens > 0) {
        while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
            SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
                    limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));

            states.pop_front();
        }
    }

    SRV_INF(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
            states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);

    for (const auto & state : states) {
        SRV_INF("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
                (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
    }
}