From e64237ae82c9fa8abf9fabf96c2c4a9dcbfcd040 Mon Sep 17 00:00:00 2001 From: Chao-Ju Chen Date: Fri, 5 Jun 2026 16:47:50 +0800 Subject: [PATCH] [Rust Frontend] Support include_reasoning=false (#44391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: RickyChen / 陳昭儒 --- .../src/routes/openai/chat_completions.rs | 368 +++++++++++++++++- .../routes/openai/chat_completions/convert.rs | 21 + .../openai/chat_completions/validate.rs | 17 +- rust/src/server/src/routes/tests.rs | 164 ++++++++ 4 files changed, 544 insertions(+), 26 deletions(-) diff --git a/rust/src/server/src/routes/openai/chat_completions.rs b/rust/src/server/src/routes/openai/chat_completions.rs index 543a7e806c4..0cccd8bf4ab 100644 --- a/rust/src/server/src/routes/openai/chat_completions.rs +++ b/rust/src/server/src/routes/openai/chat_completions.rs @@ -85,6 +85,7 @@ pub async fn chat_completions( log_request, prepared.include_usage, prepared.requested_logprobs, + prepared.include_reasoning, prepared.echo, prepared.return_token_ids, prepared.return_tokens_as_token_ids, @@ -100,6 +101,7 @@ pub async fn chat_completions( created, prepared.requested_logprobs, prepared.include_prompt_logprobs, + prepared.include_reasoning, prepared.echo, prepared.return_token_ids, prepared.return_tokens_as_token_ids, @@ -134,6 +136,7 @@ async fn collect_chat_completion( created: u64, requested_logprobs: bool, include_prompt_logprobs: bool, + include_reasoning: bool, echo: Option, return_token_ids: bool, return_tokens_as_token_ids: bool, @@ -157,6 +160,11 @@ async fn collect_chat_completion( } = collected; let stop_reason = finish_reason.as_stop_reason().map(stop_reason_to_json); let saw_tool_calls = message.tool_calls().next().is_some(); + let reasoning = message.reasoning(); + // Output logprobs and token IDs cover the complete generated token stream. + // When reasoning is hidden, omit them rather than leaking hidden reasoning + // tokens through per-token metadata. + let include_output_metadata = include_reasoning || reasoning.is_none(); let finish_reason = chat_finish_reason_to_openai(&finish_reason, saw_tool_calls)?.to_string(); let tool_calls = message .tool_calls() @@ -169,7 +177,7 @@ async fn collect_chat_completion( }, }) .collect::>(); - let logprobs = if requested_logprobs { + let logprobs = if requested_logprobs && include_output_metadata { Some(decoded_logprobs_to_openai_chat( logprobs.as_ref().ok_or_else(|| { server_error!("chat response requested logprobs but generation returned none") @@ -207,12 +215,12 @@ async fn collect_chat_completion( None => Some(message.text()).filter(|t| !t.is_empty()), }, tool_calls: Some(tool_calls).filter(|calls| !calls.is_empty()), - reasoning: message.reasoning(), + reasoning: if include_reasoning { reasoning } else { None }, }, logprobs, finish_reason: Some(finish_reason), stop_reason, - token_ids: return_token_ids.then_some(token_ids), + token_ids: (return_token_ids && include_output_metadata).then_some(token_ids), }], usage: Some(usage), system_fingerprint: None, @@ -232,12 +240,18 @@ async fn chat_completion_chunk_stream( log_request: bool, include_usage: bool, requested_logprobs: bool, + include_reasoning: bool, echo: Option, return_token_ids: bool, return_tokens_as_token_ids: bool, mut y: TryYielder, ) -> Result<(), ApiError> { let mut saw_tool_calls = false; + // `LogprobsDelta` is emitted after all chat events for one decoded update. + // If that update contains hidden reasoning, including delimiter-only block + // starts or ends, omit its token metadata as well as its visible delta. + let mut inside_hidden_reasoning = false; + let mut suppress_current_update_metadata = false; // If the client requested logprobs or token_ids, we need to buffer chunks until // we receive the separate `LogprobsDelta` event, so that we can emit one @@ -268,29 +282,44 @@ async fn chat_completion_chunk_stream( } } Ok(ChatEvent::BlockDelta { kind, delta, .. }) => { - if let Some(pending_chunk) = pending_chunk.as_mut() { - pending_chunk.push_block_delta(kind, delta); + let include_delta = + include_reasoning || !matches!(kind, AssistantBlockKind::Reasoning); + if include_delta { + if let Some(pending_chunk) = pending_chunk.as_mut() { + pending_chunk.push_block_delta(kind, delta); + } else { + y.yield_ok(block_delta_chunk( + &request_id, + &response_model, + created, + kind, + delta, + )) + .await; + } } else { - y.yield_ok(block_delta_chunk( - &request_id, - &response_model, - created, - kind, - delta, - )) - .await; + suppress_current_update_metadata = true; } } Ok(ChatEvent::LogprobsDelta { logprobs, token_ids, }) => { - let openai_logprobs = logprobs - .as_ref() - .map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids)) - .transpose()?; - let openai_token_ids = - return_token_ids.then_some(token_ids).filter(|t| !t.is_empty()); + let include_metadata = + !suppress_current_update_metadata && !inside_hidden_reasoning; + suppress_current_update_metadata = false; + let openai_logprobs = if include_metadata { + logprobs + .as_ref() + .map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids)) + .transpose()? + } else { + None + }; + let openai_token_ids = include_metadata + .then_some(token_ids) + .and_then(|token_ids| return_token_ids.then_some(token_ids)) + .filter(|t| !t.is_empty()); if let Some(pending_chunk) = pending_chunk.as_mut() { pending_chunk.logprobs = openai_logprobs; pending_chunk.token_ids = openai_token_ids; @@ -311,9 +340,17 @@ async fn chat_completion_chunk_stream( } Ok(ChatEvent::BlockStart { kind, .. }) => { debug!(?kind, "starting new block"); + if !include_reasoning && matches!(kind, AssistantBlockKind::Reasoning) { + inside_hidden_reasoning = true; + suppress_current_update_metadata = true; + } } Ok(ChatEvent::BlockEnd { .. }) => { debug!("ending current block"); + if inside_hidden_reasoning { + inside_hidden_reasoning = false; + suppress_current_update_metadata = true; + } } Ok(ChatEvent::ToolCallStart { index, id, name }) => { let tool_index = index as u32; @@ -763,7 +800,9 @@ fn stop_reason_to_json(stop_reason: &StopReason) -> Value { mod tests { use futures::{StreamExt as _, stream}; use serde_json::json; - use vllm_chat::{AssistantBlockKind, AssistantToolCall, ChatEvent, FinishReason}; + use vllm_chat::{ + AssistantBlockKind, AssistantContentBlock, AssistantToolCall, ChatEvent, FinishReason, + }; use vllm_engine_core_client::protocol::StopReason; use vllm_text::{DecodedLogprobs, DecodedPositionLogprobs, DecodedTokenLogprob}; @@ -895,6 +934,7 @@ mod tests { false, false, true, + true, None, false, false, @@ -958,6 +998,7 @@ mod tests { false, false, true, + true, None, false, false, @@ -976,6 +1017,292 @@ mod tests { assert!(chunks[1].choices[0].logprobs.is_some()); } + #[tokio::test] + async fn chunk_stream_omits_reasoning_delta_when_disabled() { + let stream = stream::iter(vec![ + Ok(ChatEvent::Start { + prompt_token_ids: vec![].into(), + prompt_logprobs: None, + }), + Ok(ChatEvent::BlockDelta { + index: 0, + kind: AssistantBlockKind::Reasoning, + delta: "think".to_string(), + }), + Ok(ChatEvent::BlockDelta { + index: 1, + kind: AssistantBlockKind::Text, + delta: "answer".to_string(), + }), + Ok(ChatEvent::Done { + message: Default::default(), + prompt_token_count: 1, + output_token_count: 2, + finish_reason: FinishReason::stop_eos(), + kv_transfer_params: None, + }), + ]); + + let chunks = chat_completion_chunk_stream( + stream, + "chatcmpl-1".to_string(), + "model".to_string(), + 1, + false, + false, + false, + false, + None, + false, + false, + ) + .collect::>() + .await + .into_iter() + .collect::, _>>() + .expect("stream chunks"); + + assert_eq!(chunks.len(), 3); + assert_eq!( + chunks[1].choices[0].delta.content.as_deref(), + Some("answer") + ); + assert!( + chunks + .iter() + .all(|chunk| chunk.choices.iter().all(|choice| choice.delta.reasoning.is_none())) + ); + } + + #[tokio::test] + async fn chunk_stream_omits_logprobs_for_suppressed_reasoning() { + let stream = stream::iter(vec![ + Ok(ChatEvent::Start { + prompt_token_ids: vec![].into(), + prompt_logprobs: None, + }), + Ok(ChatEvent::BlockDelta { + index: 0, + kind: AssistantBlockKind::Reasoning, + delta: "think".to_string(), + }), + Ok(ChatEvent::LogprobsDelta { + logprobs: Some(DecodedLogprobs { + positions: vec![DecodedPositionLogprobs { + entries: vec![DecodedTokenLogprob { + token_id: 11, + token: "think".to_string(), + logprob: -0.1, + rank: 1, + }], + }], + }), + token_ids: vec![11], + }), + Ok(ChatEvent::BlockDelta { + index: 1, + kind: AssistantBlockKind::Text, + delta: "answer".to_string(), + }), + Ok(ChatEvent::LogprobsDelta { + logprobs: Some(DecodedLogprobs { + positions: vec![DecodedPositionLogprobs { + entries: vec![DecodedTokenLogprob { + token_id: 22, + token: "answer".to_string(), + logprob: -0.2, + rank: 1, + }], + }], + }), + token_ids: vec![22], + }), + Ok(ChatEvent::Done { + message: Default::default(), + prompt_token_count: 1, + output_token_count: 2, + finish_reason: FinishReason::stop_eos(), + kv_transfer_params: None, + }), + ]); + + let chunks = chat_completion_chunk_stream( + stream, + "chatcmpl-1".to_string(), + "model".to_string(), + 1, + false, + false, + true, + false, + None, + true, + false, + ) + .collect::>() + .await + .into_iter() + .collect::, _>>() + .expect("stream chunks"); + + assert_eq!(chunks.len(), 3); + let choice = &chunks[1].choices[0]; + assert_eq!(choice.delta.content.as_deref(), Some("answer")); + assert_eq!(choice.token_ids.as_deref(), Some(&[22][..])); + let logprobs = choice.logprobs.as_ref().expect("answer logprobs"); + let content = logprobs.content.as_ref().expect("logprobs content"); + assert_eq!(content[0].token, "answer"); + assert!(chunks.iter().all(|chunk| { + chunk.choices.iter().all(|choice| { + choice.delta.reasoning.is_none() + && choice.token_ids.as_deref() != Some(&[11][..]) + && choice + .logprobs + .as_ref() + .and_then(|logprobs| logprobs.content.as_ref()) + .is_none_or(|content| content.iter().all(|entry| entry.token != "think")) + }) + })); + } + + #[tokio::test] + async fn chunk_stream_omits_logprobs_for_hidden_reasoning_delimiters() { + let stream = stream::iter(vec![ + Ok(ChatEvent::Start { + prompt_token_ids: vec![].into(), + prompt_logprobs: None, + }), + Ok(ChatEvent::BlockStart { + index: 0, + kind: AssistantBlockKind::Reasoning, + }), + Ok(ChatEvent::LogprobsDelta { + logprobs: Some(DecodedLogprobs { + positions: vec![DecodedPositionLogprobs { + entries: vec![DecodedTokenLogprob { + token_id: 11, + token: "".to_string(), + logprob: -0.1, + rank: 1, + }], + }], + }), + token_ids: vec![11], + }), + Ok(ChatEvent::BlockDelta { + index: 0, + kind: AssistantBlockKind::Reasoning, + delta: "think".to_string(), + }), + Ok(ChatEvent::LogprobsDelta { + logprobs: Some(DecodedLogprobs { + positions: vec![DecodedPositionLogprobs { + entries: vec![DecodedTokenLogprob { + token_id: 12, + token: "think".to_string(), + logprob: -0.2, + rank: 1, + }], + }], + }), + token_ids: vec![12], + }), + Ok(ChatEvent::BlockEnd { + index: 0, + block: AssistantContentBlock::Reasoning { + text: "think".to_string(), + }, + }), + Ok(ChatEvent::LogprobsDelta { + logprobs: Some(DecodedLogprobs { + positions: vec![DecodedPositionLogprobs { + entries: vec![DecodedTokenLogprob { + token_id: 13, + token: "".to_string(), + logprob: -0.3, + rank: 1, + }], + }], + }), + token_ids: vec![13], + }), + Ok(ChatEvent::BlockStart { + index: 1, + kind: AssistantBlockKind::Text, + }), + Ok(ChatEvent::BlockDelta { + index: 1, + kind: AssistantBlockKind::Text, + delta: "answer".to_string(), + }), + Ok(ChatEvent::LogprobsDelta { + logprobs: Some(DecodedLogprobs { + positions: vec![DecodedPositionLogprobs { + entries: vec![DecodedTokenLogprob { + token_id: 22, + token: "answer".to_string(), + logprob: -0.4, + rank: 1, + }], + }], + }), + token_ids: vec![22], + }), + Ok(ChatEvent::Done { + message: Default::default(), + prompt_token_count: 1, + output_token_count: 4, + finish_reason: FinishReason::stop_eos(), + kv_transfer_params: None, + }), + ]); + + let chunks = chat_completion_chunk_stream( + stream, + "chatcmpl-1".to_string(), + "model".to_string(), + 1, + false, + false, + true, + false, + None, + true, + false, + ) + .collect::>() + .await + .into_iter() + .collect::, _>>() + .expect("stream chunks"); + + assert_eq!(chunks.len(), 3); + let choice = &chunks[1].choices[0]; + assert_eq!(choice.delta.content.as_deref(), Some("answer")); + assert_eq!(choice.token_ids.as_deref(), Some(&[22][..])); + let logprobs = choice.logprobs.as_ref().expect("answer logprobs"); + let content = logprobs.content.as_ref().expect("logprobs content"); + assert_eq!(content[0].token, "answer"); + assert!(chunks.iter().all(|chunk| { + chunk.choices.iter().all(|choice| { + choice.delta.reasoning.is_none() + && !choice + .token_ids + .as_ref() + .is_some_and(|ids| matches!(ids.as_slice(), [11] | [12] | [13])) + && choice + .logprobs + .as_ref() + .and_then(|logprobs| logprobs.content.as_ref()) + .is_none_or(|content| { + content.iter().all(|entry| { + !matches!(entry.token.as_str(), "" | "think" | "") + }) + }) + }) + })); + } + #[tokio::test] async fn chunk_stream_preserves_tool_call_index_and_omits_id_from_arguments_delta() { let stream = stream::iter(vec![ @@ -1017,6 +1344,7 @@ mod tests { false, false, false, + true, None, false, false, diff --git a/rust/src/server/src/routes/openai/chat_completions/convert.rs b/rust/src/server/src/routes/openai/chat_completions/convert.rs index 2701bef809c..ba75c9c1a8c 100644 --- a/rust/src/server/src/routes/openai/chat_completions/convert.rs +++ b/rust/src/server/src/routes/openai/chat_completions/convert.rs @@ -29,6 +29,8 @@ pub struct PreparedRequest { pub requested_logprobs: bool, /// Whether the caller requested top-level prompt logprobs. pub include_prompt_logprobs: bool, + /// Whether to include reasoning content in OpenAI responses. + pub include_reasoning: bool, /// Lowered chat request for `vllm-chat`. pub chat_request: ChatRequest, /// Last assistant-role message content to echo back when `echo=true`. @@ -57,6 +59,7 @@ pub(crate) fn prepare_chat_request( .as_ref() .map(|request| request.lora_name.clone()) .unwrap_or_else(|| lora_resolution.model_names.first().cloned().unwrap_or_default()); + let include_reasoning = request.include_reasoning; let echo = request .echo .then(|| extract_last_assistant_content(&request.messages)) @@ -146,6 +149,7 @@ pub(crate) fn prepare_chat_request( include_usage, requested_logprobs, include_prompt_logprobs, + include_reasoning, chat_request, echo, return_token_ids: request.return_token_ids.unwrap_or(false), @@ -480,6 +484,23 @@ mod tests { assert_eq!(prepared.chat_request.tool_choice, ChatToolChoice::Auto); } + #[test] + fn prepare_chat_request_preserves_include_reasoning_false() { + let request = ChatCompletionRequest { + include_reasoning: false, + ..base_request() + }; + + let prepared = prepare_chat_request( + request, + &served(&["Qwen/Qwen1.5-0.5B-Chat"]), + ResolvedRequestContext::default(), + ) + .expect("request is valid"); + + assert!(!prepared.include_reasoning); + } + #[test] fn prepare_chat_request_preserves_sampling_passthrough_fields() { let request = ChatCompletionRequest { diff --git a/rust/src/server/src/routes/openai/chat_completions/validate.rs b/rust/src/server/src/routes/openai/chat_completions/validate.rs index fbd10eea0cb..379f2c2d39b 100644 --- a/rust/src/server/src/routes/openai/chat_completions/validate.rs +++ b/rust/src/server/src/routes/openai/chat_completions/validate.rs @@ -120,12 +120,6 @@ pub(super) fn validate_request_compat( "thinking_token_budget", "thinking_token_budget is not supported.", )?; - if !request.include_reasoning { - bail_invalid_request!( - param = "include_reasoning", - "include_reasoning is not supported." - ); - } reject_non_default( request.media_io_kwargs.as_ref(), "media_io_kwargs", @@ -312,6 +306,17 @@ mod tests { .expect("reasoning_effort should be accepted"); } + #[test] + fn validate_request_compat_accepts_include_reasoning_false() { + let request = ChatCompletionRequest { + include_reasoning: false, + ..base_request() + }; + + validate_request_compat(&request, &served(&["Qwen/Qwen1.5-0.5B-Chat"])) + .expect("include_reasoning=false should be accepted"); + } + #[test] fn validate_request_compat_rejects_top_logprobs_without_logprobs() { let request = ChatCompletionRequest { diff --git a/rust/src/server/src/routes/tests.rs b/rust/src/server/src/routes/tests.rs index a3e437e0480..0a269f4ce17 100644 --- a/rust/src/server/src/routes/tests.rs +++ b/rust/src/server/src/routes/tests.rs @@ -3492,6 +3492,170 @@ async fn reasoning_blocks_are_mapped_to_reasoning_sse_chunks() { assert!(text.contains("\"content\":\"answer\""), "{text}"); } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[serial] +async fn include_reasoning_false_suppresses_reasoning_in_non_stream_chat() { + let (app, engine_task) = test_app_with_backend_and_stream_output_specs( + Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")), + vec![ + (bytes_to_token_ids(b"think"), None), + ( + bytes_to_token_ids(b"answer"), + Some(EngineCoreFinishReason::Length), + ), + ], + ) + .await; + + let response = app + .clone() + .call( + Request::builder() + .method("POST") + .uri("/v1/chat/completions") + .header("content-type", "application/json") + .body(Body::from( + json!({ + "model": "Qwen/Qwen1.5-0.5B-Chat", + "stream": false, + "include_reasoning": false, + "messages": [{"role": "user", "content": "hello"}] + }) + .to_string(), + )) + .expect("build request"), + ) + .await + .expect("call app"); + + assert_eq!(response.status(), StatusCode::OK); + let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body"); + engine_task.await.expect("mock engine task"); + let text = String::from_utf8(body.to_vec()).expect("utf8 body"); + let json: serde_json::Value = serde_json::from_str(&text).expect("decode json"); + + assert_eq!(json["choices"][0]["message"]["content"], "answer"); + assert!( + json["choices"][0]["message"] + .as_object() + .is_some_and(|message| !message.contains_key("reasoning")), + "{text}" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[serial] +async fn include_reasoning_false_suppresses_non_stream_output_metadata() { + let ipc = IpcNamespace::new().expect("create ipc namespace"); + let handshake_address = ipc.handshake_endpoint(); + let engine_id = b"engine-openai-hidden-reasoning-logprobs".to_vec(); + + let engine_task = MockEngineTask::new(spawn_mock_engine_task( + handshake_address.clone(), + engine_id.clone(), + |dealer, push| { + boxed_test_future(async move { + let add = recv_engine_message(dealer).await; + let request: EngineCoreRequest = + rmp_serde::from_slice(&add[1]).expect("decode request"); + let reasoning_token_ids = bytes_to_token_ids(b"think"); + let answer_token_ids = bytes_to_token_ids(b"answer"); + + send_outputs( + push, + EngineCoreOutputs { + engine_index: 0, + outputs: vec![ + request_output_with_logprobs( + &request.request_id, + reasoning_token_ids.clone(), + None, + None, + Some(sample_logprobs_for_tokens(&reasoning_token_ids)), + None, + ), + request_output_with_logprobs( + &request.request_id, + answer_token_ids.clone(), + Some(EngineCoreFinishReason::Length), + None, + Some(sample_logprobs_for_tokens(&answer_token_ids)), + None, + ), + ], + scheduler_stats: None, + timestamp: 0.0, + utility_output: None, + finished_requests: None, + wave_complete: None, + start_wave: None, + }, + ) + .await; + }) + }, + )); + + let client = EngineCoreClient::connect( + EngineCoreClientConfig::new_single(handshake_address) + .with_model_name("test-model") + .with_local_input_output_addresses( + Some(ipc.input_endpoint()), + Some(ipc.output_endpoint()), + ), + ) + .await + .expect("connect client"); + let chat = ChatLlm::from_shared_backend( + test_llm(client), + Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")), + ); + let mut app = build_router(Arc::new(AppState::new( + vec!["Qwen/Qwen1.5-0.5B-Chat".to_string()], + chat, + ))); + + let response = app + .call( + Request::builder() + .method("POST") + .uri("/v1/chat/completions") + .header("content-type", "application/json") + .body(Body::from( + json!({ + "model": "Qwen/Qwen1.5-0.5B-Chat", + "stream": false, + "include_reasoning": false, + "logprobs": true, + "return_token_ids": true, + "messages": [{"role": "user", "content": "hello"}] + }) + .to_string(), + )) + .expect("build request"), + ) + .await + .expect("call app"); + + assert_eq!(response.status(), StatusCode::OK); + let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body"); + engine_task.await.expect("mock engine task"); + let text = String::from_utf8(body.to_vec()).expect("utf8 body"); + let json: serde_json::Value = serde_json::from_str(&text).expect("decode json"); + let choice = json["choices"][0].as_object().expect("choice object"); + + assert_eq!(json["choices"][0]["message"]["content"], "answer"); + assert!( + json["choices"][0]["message"] + .as_object() + .is_some_and(|message| !message.contains_key("reasoning")), + "{text}" + ); + assert!(!choice.contains_key("logprobs"), "{text}"); + assert!(!choice.contains_key("token_ids"), "{text}"); + assert!(json["prompt_token_ids"].is_array(), "{text}"); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] #[serial] async fn tool_calls_are_mapped_to_tool_call_sse_chunks() {