[Rust Frontend] Support include_reasoning=false (#44391)

Signed-off-by: RickyChen / 陳昭儒 <ricky.chen@infinirc.com>
2026-06-06 00:16:14 +00:00 · 2026-06-05 16:47:50 +08:00
parent d61d8566ec
commit e64237ae82
4 changed files with 544 additions and 26 deletions
@@ -85,6 +85,7 @@ pub async fn chat_completions(
            log_request,
            prepared.include_usage,
            prepared.requested_logprobs,
+            prepared.include_reasoning,
            prepared.echo,
            prepared.return_token_ids,
            prepared.return_tokens_as_token_ids,
@@ -100,6 +101,7 @@ pub async fn chat_completions(
            created,
            prepared.requested_logprobs,
            prepared.include_prompt_logprobs,
+            prepared.include_reasoning,
            prepared.echo,
            prepared.return_token_ids,
            prepared.return_tokens_as_token_ids,
@@ -134,6 +136,7 @@ async fn collect_chat_completion(
    created: u64,
    requested_logprobs: bool,
    include_prompt_logprobs: bool,
+    include_reasoning: bool,
    echo: Option<String>,
    return_token_ids: bool,
    return_tokens_as_token_ids: bool,
@@ -157,6 +160,11 @@ async fn collect_chat_completion(
    } = collected;
    let stop_reason = finish_reason.as_stop_reason().map(stop_reason_to_json);
    let saw_tool_calls = message.tool_calls().next().is_some();
+    let reasoning = message.reasoning();
+    // Output logprobs and token IDs cover the complete generated token stream.
+    // When reasoning is hidden, omit them rather than leaking hidden reasoning
+    // tokens through per-token metadata.
+    let include_output_metadata = include_reasoning || reasoning.is_none();
    let finish_reason = chat_finish_reason_to_openai(&finish_reason, saw_tool_calls)?.to_string();
    let tool_calls = message
        .tool_calls()
@@ -169,7 +177,7 @@ async fn collect_chat_completion(
            },
        })
        .collect::<Vec<_>>();
-    let logprobs = if requested_logprobs {
+    let logprobs = if requested_logprobs && include_output_metadata {
        Some(decoded_logprobs_to_openai_chat(
            logprobs.as_ref().ok_or_else(|| {
                server_error!("chat response requested logprobs but generation returned none")
@@ -207,12 +215,12 @@ async fn collect_chat_completion(
                    None => Some(message.text()).filter(|t| !t.is_empty()),
                },
                tool_calls: Some(tool_calls).filter(|calls| !calls.is_empty()),
-                reasoning: message.reasoning(),
+                reasoning: if include_reasoning { reasoning } else { None },
            },
            logprobs,
            finish_reason: Some(finish_reason),
            stop_reason,
-            token_ids: return_token_ids.then_some(token_ids),
+            token_ids: (return_token_ids && include_output_metadata).then_some(token_ids),
        }],
        usage: Some(usage),
        system_fingerprint: None,
@@ -232,12 +240,18 @@ async fn chat_completion_chunk_stream(
    log_request: bool,
    include_usage: bool,
    requested_logprobs: bool,
+    include_reasoning: bool,
    echo: Option<String>,
    return_token_ids: bool,
    return_tokens_as_token_ids: bool,
    mut y: TryYielder<ChatCompletionStreamResponse, ApiError>,
 ) -> Result<(), ApiError> {
    let mut saw_tool_calls = false;
+    // `LogprobsDelta` is emitted after all chat events for one decoded update.
+    // If that update contains hidden reasoning, including delimiter-only block
+    // starts or ends, omit its token metadata as well as its visible delta.
+    let mut inside_hidden_reasoning = false;
+    let mut suppress_current_update_metadata = false;

    // If the client requested logprobs or token_ids, we need to buffer chunks until
    // we receive the separate `LogprobsDelta` event, so that we can emit one
@@ -268,29 +282,44 @@ async fn chat_completion_chunk_stream(
                }
            }
            Ok(ChatEvent::BlockDelta { kind, delta, .. }) => {
-                if let Some(pending_chunk) = pending_chunk.as_mut() {
-                    pending_chunk.push_block_delta(kind, delta);
+                let include_delta =
+                    include_reasoning || !matches!(kind, AssistantBlockKind::Reasoning);
+                if include_delta {
+                    if let Some(pending_chunk) = pending_chunk.as_mut() {
+                        pending_chunk.push_block_delta(kind, delta);
+                    } else {
+                        y.yield_ok(block_delta_chunk(
+                            &request_id,
+                            &response_model,
+                            created,
+                            kind,
+                            delta,
+                        ))
+                        .await;
+                    }
                } else {
-                    y.yield_ok(block_delta_chunk(
-                        &request_id,
-                        &response_model,
-                        created,
-                        kind,
-                        delta,
-                    ))
-                    .await;
+                    suppress_current_update_metadata = true;
                }
            }
            Ok(ChatEvent::LogprobsDelta {
                logprobs,
                token_ids,
            }) => {
-                let openai_logprobs = logprobs
-                    .as_ref()
-                    .map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids))
-                    .transpose()?;
-                let openai_token_ids =
-                    return_token_ids.then_some(token_ids).filter(|t| !t.is_empty());
+                let include_metadata =
+                    !suppress_current_update_metadata && !inside_hidden_reasoning;
+                suppress_current_update_metadata = false;
+                let openai_logprobs = if include_metadata {
+                    logprobs
+                        .as_ref()
+                        .map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids))
+                        .transpose()?
+                } else {
+                    None
+                };
+                let openai_token_ids = include_metadata
+                    .then_some(token_ids)
+                    .and_then(|token_ids| return_token_ids.then_some(token_ids))
+                    .filter(|t| !t.is_empty());
                if let Some(pending_chunk) = pending_chunk.as_mut() {
                    pending_chunk.logprobs = openai_logprobs;
                    pending_chunk.token_ids = openai_token_ids;
@@ -311,9 +340,17 @@ async fn chat_completion_chunk_stream(
            }
            Ok(ChatEvent::BlockStart { kind, .. }) => {
                debug!(?kind, "starting new block");
+                if !include_reasoning && matches!(kind, AssistantBlockKind::Reasoning) {
+                    inside_hidden_reasoning = true;
+                    suppress_current_update_metadata = true;
+                }
            }
            Ok(ChatEvent::BlockEnd { .. }) => {
                debug!("ending current block");
+                if inside_hidden_reasoning {
+                    inside_hidden_reasoning = false;
+                    suppress_current_update_metadata = true;
+                }
            }
            Ok(ChatEvent::ToolCallStart { index, id, name }) => {
                let tool_index = index as u32;
@@ -763,7 +800,9 @@ fn stop_reason_to_json(stop_reason: &StopReason) -> Value {
 mod tests {
    use futures::{StreamExt as _, stream};
    use serde_json::json;
-    use vllm_chat::{AssistantBlockKind, AssistantToolCall, ChatEvent, FinishReason};
+    use vllm_chat::{
+        AssistantBlockKind, AssistantContentBlock, AssistantToolCall, ChatEvent, FinishReason,
+    };
    use vllm_engine_core_client::protocol::StopReason;
    use vllm_text::{DecodedLogprobs, DecodedPositionLogprobs, DecodedTokenLogprob};

@@ -895,6 +934,7 @@ mod tests {
            false,
            false,
            true,
+            true,
            None,
            false,
            false,
@@ -958,6 +998,7 @@ mod tests {
            false,
            false,
            true,
+            true,
            None,
            false,
            false,
@@ -976,6 +1017,292 @@ mod tests {
        assert!(chunks[1].choices[0].logprobs.is_some());
    }

+    #[tokio::test]
+    async fn chunk_stream_omits_reasoning_delta_when_disabled() {
+        let stream = stream::iter(vec![
+            Ok(ChatEvent::Start {
+                prompt_token_ids: vec![].into(),
+                prompt_logprobs: None,
+            }),
+            Ok(ChatEvent::BlockDelta {
+                index: 0,
+                kind: AssistantBlockKind::Reasoning,
+                delta: "think".to_string(),
+            }),
+            Ok(ChatEvent::BlockDelta {
+                index: 1,
+                kind: AssistantBlockKind::Text,
+                delta: "answer".to_string(),
+            }),
+            Ok(ChatEvent::Done {
+                message: Default::default(),
+                prompt_token_count: 1,
+                output_token_count: 2,
+                finish_reason: FinishReason::stop_eos(),
+                kv_transfer_params: None,
+            }),
+        ]);
+
+        let chunks = chat_completion_chunk_stream(
+            stream,
+            "chatcmpl-1".to_string(),
+            "model".to_string(),
+            1,
+            false,
+            false,
+            false,
+            false,
+            None,
+            false,
+            false,
+        )
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .collect::<Result<Vec<_>, _>>()
+        .expect("stream chunks");
+
+        assert_eq!(chunks.len(), 3);
+        assert_eq!(
+            chunks[1].choices[0].delta.content.as_deref(),
+            Some("answer")
+        );
+        assert!(
+            chunks
+                .iter()
+                .all(|chunk| chunk.choices.iter().all(|choice| choice.delta.reasoning.is_none()))
+        );
+    }
+
+    #[tokio::test]
+    async fn chunk_stream_omits_logprobs_for_suppressed_reasoning() {
+        let stream = stream::iter(vec![
+            Ok(ChatEvent::Start {
+                prompt_token_ids: vec![].into(),
+                prompt_logprobs: None,
+            }),
+            Ok(ChatEvent::BlockDelta {
+                index: 0,
+                kind: AssistantBlockKind::Reasoning,
+                delta: "think".to_string(),
+            }),
+            Ok(ChatEvent::LogprobsDelta {
+                logprobs: Some(DecodedLogprobs {
+                    positions: vec![DecodedPositionLogprobs {
+                        entries: vec![DecodedTokenLogprob {
+                            token_id: 11,
+                            token: "think".to_string(),
+                            logprob: -0.1,
+                            rank: 1,
+                        }],
+                    }],
+                }),
+                token_ids: vec![11],
+            }),
+            Ok(ChatEvent::BlockDelta {
+                index: 1,
+                kind: AssistantBlockKind::Text,
+                delta: "answer".to_string(),
+            }),
+            Ok(ChatEvent::LogprobsDelta {
+                logprobs: Some(DecodedLogprobs {
+                    positions: vec![DecodedPositionLogprobs {
+                        entries: vec![DecodedTokenLogprob {
+                            token_id: 22,
+                            token: "answer".to_string(),
+                            logprob: -0.2,
+                            rank: 1,
+                        }],
+                    }],
+                }),
+                token_ids: vec![22],
+            }),
+            Ok(ChatEvent::Done {
+                message: Default::default(),
+                prompt_token_count: 1,
+                output_token_count: 2,
+                finish_reason: FinishReason::stop_eos(),
+                kv_transfer_params: None,
+            }),
+        ]);
+
+        let chunks = chat_completion_chunk_stream(
+            stream,
+            "chatcmpl-1".to_string(),
+            "model".to_string(),
+            1,
+            false,
+            false,
+            true,
+            false,
+            None,
+            true,
+            false,
+        )
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .collect::<Result<Vec<_>, _>>()
+        .expect("stream chunks");
+
+        assert_eq!(chunks.len(), 3);
+        let choice = &chunks[1].choices[0];
+        assert_eq!(choice.delta.content.as_deref(), Some("answer"));
+        assert_eq!(choice.token_ids.as_deref(), Some(&[22][..]));
+        let logprobs = choice.logprobs.as_ref().expect("answer logprobs");
+        let content = logprobs.content.as_ref().expect("logprobs content");
+        assert_eq!(content[0].token, "answer");
+        assert!(chunks.iter().all(|chunk| {
+            chunk.choices.iter().all(|choice| {
+                choice.delta.reasoning.is_none()
+                    && choice.token_ids.as_deref() != Some(&[11][..])
+                    && choice
+                        .logprobs
+                        .as_ref()
+                        .and_then(|logprobs| logprobs.content.as_ref())
+                        .is_none_or(|content| content.iter().all(|entry| entry.token != "think"))
+            })
+        }));
+    }
+
+    #[tokio::test]
+    async fn chunk_stream_omits_logprobs_for_hidden_reasoning_delimiters() {
+        let stream = stream::iter(vec![
+            Ok(ChatEvent::Start {
+                prompt_token_ids: vec![].into(),
+                prompt_logprobs: None,
+            }),
+            Ok(ChatEvent::BlockStart {
+                index: 0,
+                kind: AssistantBlockKind::Reasoning,
+            }),
+            Ok(ChatEvent::LogprobsDelta {
+                logprobs: Some(DecodedLogprobs {
+                    positions: vec![DecodedPositionLogprobs {
+                        entries: vec![DecodedTokenLogprob {
+                            token_id: 11,
+                            token: "<think>".to_string(),
+                            logprob: -0.1,
+                            rank: 1,
+                        }],
+                    }],
+                }),
+                token_ids: vec![11],
+            }),
+            Ok(ChatEvent::BlockDelta {
+                index: 0,
+                kind: AssistantBlockKind::Reasoning,
+                delta: "think".to_string(),
+            }),
+            Ok(ChatEvent::LogprobsDelta {
+                logprobs: Some(DecodedLogprobs {
+                    positions: vec![DecodedPositionLogprobs {
+                        entries: vec![DecodedTokenLogprob {
+                            token_id: 12,
+                            token: "think".to_string(),
+                            logprob: -0.2,
+                            rank: 1,
+                        }],
+                    }],
+                }),
+                token_ids: vec![12],
+            }),
+            Ok(ChatEvent::BlockEnd {
+                index: 0,
+                block: AssistantContentBlock::Reasoning {
+                    text: "think".to_string(),
+                },
+            }),
+            Ok(ChatEvent::LogprobsDelta {
+                logprobs: Some(DecodedLogprobs {
+                    positions: vec![DecodedPositionLogprobs {
+                        entries: vec![DecodedTokenLogprob {
+                            token_id: 13,
+                            token: "</think>".to_string(),
+                            logprob: -0.3,
+                            rank: 1,
+                        }],
+                    }],
+                }),
+                token_ids: vec![13],
+            }),
+            Ok(ChatEvent::BlockStart {
+                index: 1,
+                kind: AssistantBlockKind::Text,
+            }),
+            Ok(ChatEvent::BlockDelta {
+                index: 1,
+                kind: AssistantBlockKind::Text,
+                delta: "answer".to_string(),
+            }),
+            Ok(ChatEvent::LogprobsDelta {
+                logprobs: Some(DecodedLogprobs {
+                    positions: vec![DecodedPositionLogprobs {
+                        entries: vec![DecodedTokenLogprob {
+                            token_id: 22,
+                            token: "answer".to_string(),
+                            logprob: -0.4,
+                            rank: 1,
+                        }],
+                    }],
+                }),
+                token_ids: vec![22],
+            }),
+            Ok(ChatEvent::Done {
+                message: Default::default(),
+                prompt_token_count: 1,
+                output_token_count: 4,
+                finish_reason: FinishReason::stop_eos(),
+                kv_transfer_params: None,
+            }),
+        ]);
+
+        let chunks = chat_completion_chunk_stream(
+            stream,
+            "chatcmpl-1".to_string(),
+            "model".to_string(),
+            1,
+            false,
+            false,
+            true,
+            false,
+            None,
+            true,
+            false,
+        )
+        .collect::<Vec<_>>()
+        .await
+        .into_iter()
+        .collect::<Result<Vec<_>, _>>()
+        .expect("stream chunks");
+
+        assert_eq!(chunks.len(), 3);
+        let choice = &chunks[1].choices[0];
+        assert_eq!(choice.delta.content.as_deref(), Some("answer"));
+        assert_eq!(choice.token_ids.as_deref(), Some(&[22][..]));
+        let logprobs = choice.logprobs.as_ref().expect("answer logprobs");
+        let content = logprobs.content.as_ref().expect("logprobs content");
+        assert_eq!(content[0].token, "answer");
+        assert!(chunks.iter().all(|chunk| {
+            chunk.choices.iter().all(|choice| {
+                choice.delta.reasoning.is_none()
+                    && !choice
+                        .token_ids
+                        .as_ref()
+                        .is_some_and(|ids| matches!(ids.as_slice(), [11] | [12] | [13]))
+                    && choice
+                        .logprobs
+                        .as_ref()
+                        .and_then(|logprobs| logprobs.content.as_ref())
+                        .is_none_or(|content| {
+                            content.iter().all(|entry| {
+                                !matches!(entry.token.as_str(), "<think>" | "think" | "</think>")
+                            })
+                        })
+            })
+        }));
+    }
+
    #[tokio::test]
    async fn chunk_stream_preserves_tool_call_index_and_omits_id_from_arguments_delta() {
        let stream = stream::iter(vec![
@@ -1017,6 +1344,7 @@ mod tests {
            false,
            false,
            false,
+            true,
            None,
            false,
            false,
@@ -29,6 +29,8 @@ pub struct PreparedRequest {
    pub requested_logprobs: bool,
    /// Whether the caller requested top-level prompt logprobs.
    pub include_prompt_logprobs: bool,
+    /// Whether to include reasoning content in OpenAI responses.
+    pub include_reasoning: bool,
    /// Lowered chat request for `vllm-chat`.
    pub chat_request: ChatRequest,
    /// Last assistant-role message content to echo back when `echo=true`.
@@ -57,6 +59,7 @@ pub(crate) fn prepare_chat_request(
        .as_ref()
        .map(|request| request.lora_name.clone())
        .unwrap_or_else(|| lora_resolution.model_names.first().cloned().unwrap_or_default());
+    let include_reasoning = request.include_reasoning;
    let echo = request
        .echo
        .then(|| extract_last_assistant_content(&request.messages))
@@ -146,6 +149,7 @@ pub(crate) fn prepare_chat_request(
        include_usage,
        requested_logprobs,
        include_prompt_logprobs,
+        include_reasoning,
        chat_request,
        echo,
        return_token_ids: request.return_token_ids.unwrap_or(false),
@@ -480,6 +484,23 @@ mod tests {
        assert_eq!(prepared.chat_request.tool_choice, ChatToolChoice::Auto);
    }

+    #[test]
+    fn prepare_chat_request_preserves_include_reasoning_false() {
+        let request = ChatCompletionRequest {
+            include_reasoning: false,
+            ..base_request()
+        };
+
+        let prepared = prepare_chat_request(
+            request,
+            &served(&["Qwen/Qwen1.5-0.5B-Chat"]),
+            ResolvedRequestContext::default(),
+        )
+        .expect("request is valid");
+
+        assert!(!prepared.include_reasoning);
+    }
+
    #[test]
    fn prepare_chat_request_preserves_sampling_passthrough_fields() {
        let request = ChatCompletionRequest {
@@ -120,12 +120,6 @@ pub(super) fn validate_request_compat(
        "thinking_token_budget",
        "thinking_token_budget is not supported.",
    )?;
-    if !request.include_reasoning {
-        bail_invalid_request!(
-            param = "include_reasoning",
-            "include_reasoning is not supported."
-        );
-    }
    reject_non_default(
        request.media_io_kwargs.as_ref(),
        "media_io_kwargs",
@@ -312,6 +306,17 @@ mod tests {
            .expect("reasoning_effort should be accepted");
    }

+    #[test]
+    fn validate_request_compat_accepts_include_reasoning_false() {
+        let request = ChatCompletionRequest {
+            include_reasoning: false,
+            ..base_request()
+        };
+
+        validate_request_compat(&request, &served(&["Qwen/Qwen1.5-0.5B-Chat"]))
+            .expect("include_reasoning=false should be accepted");
+    }
+
    #[test]
    fn validate_request_compat_rejects_top_logprobs_without_logprobs() {
        let request = ChatCompletionRequest {
@@ -3492,6 +3492,170 @@ async fn reasoning_blocks_are_mapped_to_reasoning_sse_chunks() {
    assert!(text.contains("\"content\":\"answer\""), "{text}");
 }

+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+#[serial]
+async fn include_reasoning_false_suppresses_reasoning_in_non_stream_chat() {
+    let (app, engine_task) = test_app_with_backend_and_stream_output_specs(
+        Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")),
+        vec![
+            (bytes_to_token_ids(b"<think>think</think>"), None),
+            (
+                bytes_to_token_ids(b"answer"),
+                Some(EngineCoreFinishReason::Length),
+            ),
+        ],
+    )
+    .await;
+
+    let response = app
+        .clone()
+        .call(
+            Request::builder()
+                .method("POST")
+                .uri("/v1/chat/completions")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "model": "Qwen/Qwen1.5-0.5B-Chat",
+                        "stream": false,
+                        "include_reasoning": false,
+                        "messages": [{"role": "user", "content": "hello"}]
+                    })
+                    .to_string(),
+                ))
+                .expect("build request"),
+        )
+        .await
+        .expect("call app");
+
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
+    engine_task.await.expect("mock engine task");
+    let text = String::from_utf8(body.to_vec()).expect("utf8 body");
+    let json: serde_json::Value = serde_json::from_str(&text).expect("decode json");
+
+    assert_eq!(json["choices"][0]["message"]["content"], "answer");
+    assert!(
+        json["choices"][0]["message"]
+            .as_object()
+            .is_some_and(|message| !message.contains_key("reasoning")),
+        "{text}"
+    );
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+#[serial]
+async fn include_reasoning_false_suppresses_non_stream_output_metadata() {
+    let ipc = IpcNamespace::new().expect("create ipc namespace");
+    let handshake_address = ipc.handshake_endpoint();
+    let engine_id = b"engine-openai-hidden-reasoning-logprobs".to_vec();
+
+    let engine_task = MockEngineTask::new(spawn_mock_engine_task(
+        handshake_address.clone(),
+        engine_id.clone(),
+        |dealer, push| {
+            boxed_test_future(async move {
+                let add = recv_engine_message(dealer).await;
+                let request: EngineCoreRequest =
+                    rmp_serde::from_slice(&add[1]).expect("decode request");
+                let reasoning_token_ids = bytes_to_token_ids(b"<think>think</think>");
+                let answer_token_ids = bytes_to_token_ids(b"answer");
+
+                send_outputs(
+                    push,
+                    EngineCoreOutputs {
+                        engine_index: 0,
+                        outputs: vec![
+                            request_output_with_logprobs(
+                                &request.request_id,
+                                reasoning_token_ids.clone(),
+                                None,
+                                None,
+                                Some(sample_logprobs_for_tokens(&reasoning_token_ids)),
+                                None,
+                            ),
+                            request_output_with_logprobs(
+                                &request.request_id,
+                                answer_token_ids.clone(),
+                                Some(EngineCoreFinishReason::Length),
+                                None,
+                                Some(sample_logprobs_for_tokens(&answer_token_ids)),
+                                None,
+                            ),
+                        ],
+                        scheduler_stats: None,
+                        timestamp: 0.0,
+                        utility_output: None,
+                        finished_requests: None,
+                        wave_complete: None,
+                        start_wave: None,
+                    },
+                )
+                .await;
+            })
+        },
+    ));
+
+    let client = EngineCoreClient::connect(
+        EngineCoreClientConfig::new_single(handshake_address)
+            .with_model_name("test-model")
+            .with_local_input_output_addresses(
+                Some(ipc.input_endpoint()),
+                Some(ipc.output_endpoint()),
+            ),
+    )
+    .await
+    .expect("connect client");
+    let chat = ChatLlm::from_shared_backend(
+        test_llm(client),
+        Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")),
+    );
+    let mut app = build_router(Arc::new(AppState::new(
+        vec!["Qwen/Qwen1.5-0.5B-Chat".to_string()],
+        chat,
+    )));
+
+    let response = app
+        .call(
+            Request::builder()
+                .method("POST")
+                .uri("/v1/chat/completions")
+                .header("content-type", "application/json")
+                .body(Body::from(
+                    json!({
+                        "model": "Qwen/Qwen1.5-0.5B-Chat",
+                        "stream": false,
+                        "include_reasoning": false,
+                        "logprobs": true,
+                        "return_token_ids": true,
+                        "messages": [{"role": "user", "content": "hello"}]
+                    })
+                    .to_string(),
+                ))
+                .expect("build request"),
+        )
+        .await
+        .expect("call app");
+
+    assert_eq!(response.status(), StatusCode::OK);
+    let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
+    engine_task.await.expect("mock engine task");
+    let text = String::from_utf8(body.to_vec()).expect("utf8 body");
+    let json: serde_json::Value = serde_json::from_str(&text).expect("decode json");
+    let choice = json["choices"][0].as_object().expect("choice object");
+
+    assert_eq!(json["choices"][0]["message"]["content"], "answer");
+    assert!(
+        json["choices"][0]["message"]
+            .as_object()
+            .is_some_and(|message| !message.contains_key("reasoning")),
+        "{text}"
+    );
+    assert!(!choice.contains_key("logprobs"), "{text}");
+    assert!(!choice.contains_key("token_ids"), "{text}");
+    assert!(json["prompt_token_ids"].is_array(), "{text}");
+}
+
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 #[serial]
 async fn tool_calls_are_mapped_to_tool_call_sse_chunks() {