[Rust Frontend] Support include_reasoning=false (#44391)

Signed-off-by: RickyChen / 陳昭儒 <ricky.chen@infinirc.com>
This commit is contained in:
Chao-Ju Chen
2026-06-05 16:47:50 +08:00
committed by GitHub
parent d61d8566ec
commit e64237ae82
4 changed files with 544 additions and 26 deletions
@@ -85,6 +85,7 @@ pub async fn chat_completions(
log_request,
prepared.include_usage,
prepared.requested_logprobs,
prepared.include_reasoning,
prepared.echo,
prepared.return_token_ids,
prepared.return_tokens_as_token_ids,
@@ -100,6 +101,7 @@ pub async fn chat_completions(
created,
prepared.requested_logprobs,
prepared.include_prompt_logprobs,
prepared.include_reasoning,
prepared.echo,
prepared.return_token_ids,
prepared.return_tokens_as_token_ids,
@@ -134,6 +136,7 @@ async fn collect_chat_completion(
created: u64,
requested_logprobs: bool,
include_prompt_logprobs: bool,
include_reasoning: bool,
echo: Option<String>,
return_token_ids: bool,
return_tokens_as_token_ids: bool,
@@ -157,6 +160,11 @@ async fn collect_chat_completion(
} = collected;
let stop_reason = finish_reason.as_stop_reason().map(stop_reason_to_json);
let saw_tool_calls = message.tool_calls().next().is_some();
let reasoning = message.reasoning();
// Output logprobs and token IDs cover the complete generated token stream.
// When reasoning is hidden, omit them rather than leaking hidden reasoning
// tokens through per-token metadata.
let include_output_metadata = include_reasoning || reasoning.is_none();
let finish_reason = chat_finish_reason_to_openai(&finish_reason, saw_tool_calls)?.to_string();
let tool_calls = message
.tool_calls()
@@ -169,7 +177,7 @@ async fn collect_chat_completion(
},
})
.collect::<Vec<_>>();
let logprobs = if requested_logprobs {
let logprobs = if requested_logprobs && include_output_metadata {
Some(decoded_logprobs_to_openai_chat(
logprobs.as_ref().ok_or_else(|| {
server_error!("chat response requested logprobs but generation returned none")
@@ -207,12 +215,12 @@ async fn collect_chat_completion(
None => Some(message.text()).filter(|t| !t.is_empty()),
},
tool_calls: Some(tool_calls).filter(|calls| !calls.is_empty()),
reasoning: message.reasoning(),
reasoning: if include_reasoning { reasoning } else { None },
},
logprobs,
finish_reason: Some(finish_reason),
stop_reason,
token_ids: return_token_ids.then_some(token_ids),
token_ids: (return_token_ids && include_output_metadata).then_some(token_ids),
}],
usage: Some(usage),
system_fingerprint: None,
@@ -232,12 +240,18 @@ async fn chat_completion_chunk_stream(
log_request: bool,
include_usage: bool,
requested_logprobs: bool,
include_reasoning: bool,
echo: Option<String>,
return_token_ids: bool,
return_tokens_as_token_ids: bool,
mut y: TryYielder<ChatCompletionStreamResponse, ApiError>,
) -> Result<(), ApiError> {
let mut saw_tool_calls = false;
// `LogprobsDelta` is emitted after all chat events for one decoded update.
// If that update contains hidden reasoning, including delimiter-only block
// starts or ends, omit its token metadata as well as its visible delta.
let mut inside_hidden_reasoning = false;
let mut suppress_current_update_metadata = false;
// If the client requested logprobs or token_ids, we need to buffer chunks until
// we receive the separate `LogprobsDelta` event, so that we can emit one
@@ -268,29 +282,44 @@ async fn chat_completion_chunk_stream(
}
}
Ok(ChatEvent::BlockDelta { kind, delta, .. }) => {
if let Some(pending_chunk) = pending_chunk.as_mut() {
pending_chunk.push_block_delta(kind, delta);
let include_delta =
include_reasoning || !matches!(kind, AssistantBlockKind::Reasoning);
if include_delta {
if let Some(pending_chunk) = pending_chunk.as_mut() {
pending_chunk.push_block_delta(kind, delta);
} else {
y.yield_ok(block_delta_chunk(
&request_id,
&response_model,
created,
kind,
delta,
))
.await;
}
} else {
y.yield_ok(block_delta_chunk(
&request_id,
&response_model,
created,
kind,
delta,
))
.await;
suppress_current_update_metadata = true;
}
}
Ok(ChatEvent::LogprobsDelta {
logprobs,
token_ids,
}) => {
let openai_logprobs = logprobs
.as_ref()
.map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids))
.transpose()?;
let openai_token_ids =
return_token_ids.then_some(token_ids).filter(|t| !t.is_empty());
let include_metadata =
!suppress_current_update_metadata && !inside_hidden_reasoning;
suppress_current_update_metadata = false;
let openai_logprobs = if include_metadata {
logprobs
.as_ref()
.map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids))
.transpose()?
} else {
None
};
let openai_token_ids = include_metadata
.then_some(token_ids)
.and_then(|token_ids| return_token_ids.then_some(token_ids))
.filter(|t| !t.is_empty());
if let Some(pending_chunk) = pending_chunk.as_mut() {
pending_chunk.logprobs = openai_logprobs;
pending_chunk.token_ids = openai_token_ids;
@@ -311,9 +340,17 @@ async fn chat_completion_chunk_stream(
}
Ok(ChatEvent::BlockStart { kind, .. }) => {
debug!(?kind, "starting new block");
if !include_reasoning && matches!(kind, AssistantBlockKind::Reasoning) {
inside_hidden_reasoning = true;
suppress_current_update_metadata = true;
}
}
Ok(ChatEvent::BlockEnd { .. }) => {
debug!("ending current block");
if inside_hidden_reasoning {
inside_hidden_reasoning = false;
suppress_current_update_metadata = true;
}
}
Ok(ChatEvent::ToolCallStart { index, id, name }) => {
let tool_index = index as u32;
@@ -763,7 +800,9 @@ fn stop_reason_to_json(stop_reason: &StopReason) -> Value {
mod tests {
use futures::{StreamExt as _, stream};
use serde_json::json;
use vllm_chat::{AssistantBlockKind, AssistantToolCall, ChatEvent, FinishReason};
use vllm_chat::{
AssistantBlockKind, AssistantContentBlock, AssistantToolCall, ChatEvent, FinishReason,
};
use vllm_engine_core_client::protocol::StopReason;
use vllm_text::{DecodedLogprobs, DecodedPositionLogprobs, DecodedTokenLogprob};
@@ -895,6 +934,7 @@ mod tests {
false,
false,
true,
true,
None,
false,
false,
@@ -958,6 +998,7 @@ mod tests {
false,
false,
true,
true,
None,
false,
false,
@@ -976,6 +1017,292 @@ mod tests {
assert!(chunks[1].choices[0].logprobs.is_some());
}
#[tokio::test]
async fn chunk_stream_omits_reasoning_delta_when_disabled() {
let stream = stream::iter(vec![
Ok(ChatEvent::Start {
prompt_token_ids: vec![].into(),
prompt_logprobs: None,
}),
Ok(ChatEvent::BlockDelta {
index: 0,
kind: AssistantBlockKind::Reasoning,
delta: "think".to_string(),
}),
Ok(ChatEvent::BlockDelta {
index: 1,
kind: AssistantBlockKind::Text,
delta: "answer".to_string(),
}),
Ok(ChatEvent::Done {
message: Default::default(),
prompt_token_count: 1,
output_token_count: 2,
finish_reason: FinishReason::stop_eos(),
kv_transfer_params: None,
}),
]);
let chunks = chat_completion_chunk_stream(
stream,
"chatcmpl-1".to_string(),
"model".to_string(),
1,
false,
false,
false,
false,
None,
false,
false,
)
.collect::<Vec<_>>()
.await
.into_iter()
.collect::<Result<Vec<_>, _>>()
.expect("stream chunks");
assert_eq!(chunks.len(), 3);
assert_eq!(
chunks[1].choices[0].delta.content.as_deref(),
Some("answer")
);
assert!(
chunks
.iter()
.all(|chunk| chunk.choices.iter().all(|choice| choice.delta.reasoning.is_none()))
);
}
#[tokio::test]
async fn chunk_stream_omits_logprobs_for_suppressed_reasoning() {
let stream = stream::iter(vec![
Ok(ChatEvent::Start {
prompt_token_ids: vec![].into(),
prompt_logprobs: None,
}),
Ok(ChatEvent::BlockDelta {
index: 0,
kind: AssistantBlockKind::Reasoning,
delta: "think".to_string(),
}),
Ok(ChatEvent::LogprobsDelta {
logprobs: Some(DecodedLogprobs {
positions: vec![DecodedPositionLogprobs {
entries: vec![DecodedTokenLogprob {
token_id: 11,
token: "think".to_string(),
logprob: -0.1,
rank: 1,
}],
}],
}),
token_ids: vec![11],
}),
Ok(ChatEvent::BlockDelta {
index: 1,
kind: AssistantBlockKind::Text,
delta: "answer".to_string(),
}),
Ok(ChatEvent::LogprobsDelta {
logprobs: Some(DecodedLogprobs {
positions: vec![DecodedPositionLogprobs {
entries: vec![DecodedTokenLogprob {
token_id: 22,
token: "answer".to_string(),
logprob: -0.2,
rank: 1,
}],
}],
}),
token_ids: vec![22],
}),
Ok(ChatEvent::Done {
message: Default::default(),
prompt_token_count: 1,
output_token_count: 2,
finish_reason: FinishReason::stop_eos(),
kv_transfer_params: None,
}),
]);
let chunks = chat_completion_chunk_stream(
stream,
"chatcmpl-1".to_string(),
"model".to_string(),
1,
false,
false,
true,
false,
None,
true,
false,
)
.collect::<Vec<_>>()
.await
.into_iter()
.collect::<Result<Vec<_>, _>>()
.expect("stream chunks");
assert_eq!(chunks.len(), 3);
let choice = &chunks[1].choices[0];
assert_eq!(choice.delta.content.as_deref(), Some("answer"));
assert_eq!(choice.token_ids.as_deref(), Some(&[22][..]));
let logprobs = choice.logprobs.as_ref().expect("answer logprobs");
let content = logprobs.content.as_ref().expect("logprobs content");
assert_eq!(content[0].token, "answer");
assert!(chunks.iter().all(|chunk| {
chunk.choices.iter().all(|choice| {
choice.delta.reasoning.is_none()
&& choice.token_ids.as_deref() != Some(&[11][..])
&& choice
.logprobs
.as_ref()
.and_then(|logprobs| logprobs.content.as_ref())
.is_none_or(|content| content.iter().all(|entry| entry.token != "think"))
})
}));
}
#[tokio::test]
async fn chunk_stream_omits_logprobs_for_hidden_reasoning_delimiters() {
let stream = stream::iter(vec![
Ok(ChatEvent::Start {
prompt_token_ids: vec![].into(),
prompt_logprobs: None,
}),
Ok(ChatEvent::BlockStart {
index: 0,
kind: AssistantBlockKind::Reasoning,
}),
Ok(ChatEvent::LogprobsDelta {
logprobs: Some(DecodedLogprobs {
positions: vec![DecodedPositionLogprobs {
entries: vec![DecodedTokenLogprob {
token_id: 11,
token: "<think>".to_string(),
logprob: -0.1,
rank: 1,
}],
}],
}),
token_ids: vec![11],
}),
Ok(ChatEvent::BlockDelta {
index: 0,
kind: AssistantBlockKind::Reasoning,
delta: "think".to_string(),
}),
Ok(ChatEvent::LogprobsDelta {
logprobs: Some(DecodedLogprobs {
positions: vec![DecodedPositionLogprobs {
entries: vec![DecodedTokenLogprob {
token_id: 12,
token: "think".to_string(),
logprob: -0.2,
rank: 1,
}],
}],
}),
token_ids: vec![12],
}),
Ok(ChatEvent::BlockEnd {
index: 0,
block: AssistantContentBlock::Reasoning {
text: "think".to_string(),
},
}),
Ok(ChatEvent::LogprobsDelta {
logprobs: Some(DecodedLogprobs {
positions: vec![DecodedPositionLogprobs {
entries: vec![DecodedTokenLogprob {
token_id: 13,
token: "</think>".to_string(),
logprob: -0.3,
rank: 1,
}],
}],
}),
token_ids: vec![13],
}),
Ok(ChatEvent::BlockStart {
index: 1,
kind: AssistantBlockKind::Text,
}),
Ok(ChatEvent::BlockDelta {
index: 1,
kind: AssistantBlockKind::Text,
delta: "answer".to_string(),
}),
Ok(ChatEvent::LogprobsDelta {
logprobs: Some(DecodedLogprobs {
positions: vec![DecodedPositionLogprobs {
entries: vec![DecodedTokenLogprob {
token_id: 22,
token: "answer".to_string(),
logprob: -0.4,
rank: 1,
}],
}],
}),
token_ids: vec![22],
}),
Ok(ChatEvent::Done {
message: Default::default(),
prompt_token_count: 1,
output_token_count: 4,
finish_reason: FinishReason::stop_eos(),
kv_transfer_params: None,
}),
]);
let chunks = chat_completion_chunk_stream(
stream,
"chatcmpl-1".to_string(),
"model".to_string(),
1,
false,
false,
true,
false,
None,
true,
false,
)
.collect::<Vec<_>>()
.await
.into_iter()
.collect::<Result<Vec<_>, _>>()
.expect("stream chunks");
assert_eq!(chunks.len(), 3);
let choice = &chunks[1].choices[0];
assert_eq!(choice.delta.content.as_deref(), Some("answer"));
assert_eq!(choice.token_ids.as_deref(), Some(&[22][..]));
let logprobs = choice.logprobs.as_ref().expect("answer logprobs");
let content = logprobs.content.as_ref().expect("logprobs content");
assert_eq!(content[0].token, "answer");
assert!(chunks.iter().all(|chunk| {
chunk.choices.iter().all(|choice| {
choice.delta.reasoning.is_none()
&& !choice
.token_ids
.as_ref()
.is_some_and(|ids| matches!(ids.as_slice(), [11] | [12] | [13]))
&& choice
.logprobs
.as_ref()
.and_then(|logprobs| logprobs.content.as_ref())
.is_none_or(|content| {
content.iter().all(|entry| {
!matches!(entry.token.as_str(), "<think>" | "think" | "</think>")
})
})
})
}));
}
#[tokio::test]
async fn chunk_stream_preserves_tool_call_index_and_omits_id_from_arguments_delta() {
let stream = stream::iter(vec![
@@ -1017,6 +1344,7 @@ mod tests {
false,
false,
false,
true,
None,
false,
false,
@@ -29,6 +29,8 @@ pub struct PreparedRequest {
pub requested_logprobs: bool,
/// Whether the caller requested top-level prompt logprobs.
pub include_prompt_logprobs: bool,
/// Whether to include reasoning content in OpenAI responses.
pub include_reasoning: bool,
/// Lowered chat request for `vllm-chat`.
pub chat_request: ChatRequest,
/// Last assistant-role message content to echo back when `echo=true`.
@@ -57,6 +59,7 @@ pub(crate) fn prepare_chat_request(
.as_ref()
.map(|request| request.lora_name.clone())
.unwrap_or_else(|| lora_resolution.model_names.first().cloned().unwrap_or_default());
let include_reasoning = request.include_reasoning;
let echo = request
.echo
.then(|| extract_last_assistant_content(&request.messages))
@@ -146,6 +149,7 @@ pub(crate) fn prepare_chat_request(
include_usage,
requested_logprobs,
include_prompt_logprobs,
include_reasoning,
chat_request,
echo,
return_token_ids: request.return_token_ids.unwrap_or(false),
@@ -480,6 +484,23 @@ mod tests {
assert_eq!(prepared.chat_request.tool_choice, ChatToolChoice::Auto);
}
#[test]
fn prepare_chat_request_preserves_include_reasoning_false() {
let request = ChatCompletionRequest {
include_reasoning: false,
..base_request()
};
let prepared = prepare_chat_request(
request,
&served(&["Qwen/Qwen1.5-0.5B-Chat"]),
ResolvedRequestContext::default(),
)
.expect("request is valid");
assert!(!prepared.include_reasoning);
}
#[test]
fn prepare_chat_request_preserves_sampling_passthrough_fields() {
let request = ChatCompletionRequest {
@@ -120,12 +120,6 @@ pub(super) fn validate_request_compat(
"thinking_token_budget",
"thinking_token_budget is not supported.",
)?;
if !request.include_reasoning {
bail_invalid_request!(
param = "include_reasoning",
"include_reasoning is not supported."
);
}
reject_non_default(
request.media_io_kwargs.as_ref(),
"media_io_kwargs",
@@ -312,6 +306,17 @@ mod tests {
.expect("reasoning_effort should be accepted");
}
#[test]
fn validate_request_compat_accepts_include_reasoning_false() {
let request = ChatCompletionRequest {
include_reasoning: false,
..base_request()
};
validate_request_compat(&request, &served(&["Qwen/Qwen1.5-0.5B-Chat"]))
.expect("include_reasoning=false should be accepted");
}
#[test]
fn validate_request_compat_rejects_top_logprobs_without_logprobs() {
let request = ChatCompletionRequest {
+164
View File
@@ -3492,6 +3492,170 @@ async fn reasoning_blocks_are_mapped_to_reasoning_sse_chunks() {
assert!(text.contains("\"content\":\"answer\""), "{text}");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
#[serial]
async fn include_reasoning_false_suppresses_reasoning_in_non_stream_chat() {
let (app, engine_task) = test_app_with_backend_and_stream_output_specs(
Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")),
vec![
(bytes_to_token_ids(b"<think>think</think>"), None),
(
bytes_to_token_ids(b"answer"),
Some(EngineCoreFinishReason::Length),
),
],
)
.await;
let response = app
.clone()
.call(
Request::builder()
.method("POST")
.uri("/v1/chat/completions")
.header("content-type", "application/json")
.body(Body::from(
json!({
"model": "Qwen/Qwen1.5-0.5B-Chat",
"stream": false,
"include_reasoning": false,
"messages": [{"role": "user", "content": "hello"}]
})
.to_string(),
))
.expect("build request"),
)
.await
.expect("call app");
assert_eq!(response.status(), StatusCode::OK);
let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
engine_task.await.expect("mock engine task");
let text = String::from_utf8(body.to_vec()).expect("utf8 body");
let json: serde_json::Value = serde_json::from_str(&text).expect("decode json");
assert_eq!(json["choices"][0]["message"]["content"], "answer");
assert!(
json["choices"][0]["message"]
.as_object()
.is_some_and(|message| !message.contains_key("reasoning")),
"{text}"
);
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
#[serial]
async fn include_reasoning_false_suppresses_non_stream_output_metadata() {
let ipc = IpcNamespace::new().expect("create ipc namespace");
let handshake_address = ipc.handshake_endpoint();
let engine_id = b"engine-openai-hidden-reasoning-logprobs".to_vec();
let engine_task = MockEngineTask::new(spawn_mock_engine_task(
handshake_address.clone(),
engine_id.clone(),
|dealer, push| {
boxed_test_future(async move {
let add = recv_engine_message(dealer).await;
let request: EngineCoreRequest =
rmp_serde::from_slice(&add[1]).expect("decode request");
let reasoning_token_ids = bytes_to_token_ids(b"<think>think</think>");
let answer_token_ids = bytes_to_token_ids(b"answer");
send_outputs(
push,
EngineCoreOutputs {
engine_index: 0,
outputs: vec![
request_output_with_logprobs(
&request.request_id,
reasoning_token_ids.clone(),
None,
None,
Some(sample_logprobs_for_tokens(&reasoning_token_ids)),
None,
),
request_output_with_logprobs(
&request.request_id,
answer_token_ids.clone(),
Some(EngineCoreFinishReason::Length),
None,
Some(sample_logprobs_for_tokens(&answer_token_ids)),
None,
),
],
scheduler_stats: None,
timestamp: 0.0,
utility_output: None,
finished_requests: None,
wave_complete: None,
start_wave: None,
},
)
.await;
})
},
));
let client = EngineCoreClient::connect(
EngineCoreClientConfig::new_single(handshake_address)
.with_model_name("test-model")
.with_local_input_output_addresses(
Some(ipc.input_endpoint()),
Some(ipc.output_endpoint()),
),
)
.await
.expect("connect client");
let chat = ChatLlm::from_shared_backend(
test_llm(client),
Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")),
);
let mut app = build_router(Arc::new(AppState::new(
vec!["Qwen/Qwen1.5-0.5B-Chat".to_string()],
chat,
)));
let response = app
.call(
Request::builder()
.method("POST")
.uri("/v1/chat/completions")
.header("content-type", "application/json")
.body(Body::from(
json!({
"model": "Qwen/Qwen1.5-0.5B-Chat",
"stream": false,
"include_reasoning": false,
"logprobs": true,
"return_token_ids": true,
"messages": [{"role": "user", "content": "hello"}]
})
.to_string(),
))
.expect("build request"),
)
.await
.expect("call app");
assert_eq!(response.status(), StatusCode::OK);
let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
engine_task.await.expect("mock engine task");
let text = String::from_utf8(body.to_vec()).expect("utf8 body");
let json: serde_json::Value = serde_json::from_str(&text).expect("decode json");
let choice = json["choices"][0].as_object().expect("choice object");
assert_eq!(json["choices"][0]["message"]["content"], "answer");
assert!(
json["choices"][0]["message"]
.as_object()
.is_some_and(|message| !message.contains_key("reasoning")),
"{text}"
);
assert!(!choice.contains_key("logprobs"), "{text}");
assert!(!choice.contains_key("token_ids"), "{text}");
assert!(json["prompt_token_ids"].is_array(), "{text}");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
#[serial]
async fn tool_calls_are_mapped_to_tool_call_sse_chunks() {