mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Rust Frontend] Support include_reasoning=false (#44391)
Signed-off-by: RickyChen / 陳昭儒 <ricky.chen@infinirc.com>
This commit is contained in:
@@ -85,6 +85,7 @@ pub async fn chat_completions(
|
||||
log_request,
|
||||
prepared.include_usage,
|
||||
prepared.requested_logprobs,
|
||||
prepared.include_reasoning,
|
||||
prepared.echo,
|
||||
prepared.return_token_ids,
|
||||
prepared.return_tokens_as_token_ids,
|
||||
@@ -100,6 +101,7 @@ pub async fn chat_completions(
|
||||
created,
|
||||
prepared.requested_logprobs,
|
||||
prepared.include_prompt_logprobs,
|
||||
prepared.include_reasoning,
|
||||
prepared.echo,
|
||||
prepared.return_token_ids,
|
||||
prepared.return_tokens_as_token_ids,
|
||||
@@ -134,6 +136,7 @@ async fn collect_chat_completion(
|
||||
created: u64,
|
||||
requested_logprobs: bool,
|
||||
include_prompt_logprobs: bool,
|
||||
include_reasoning: bool,
|
||||
echo: Option<String>,
|
||||
return_token_ids: bool,
|
||||
return_tokens_as_token_ids: bool,
|
||||
@@ -157,6 +160,11 @@ async fn collect_chat_completion(
|
||||
} = collected;
|
||||
let stop_reason = finish_reason.as_stop_reason().map(stop_reason_to_json);
|
||||
let saw_tool_calls = message.tool_calls().next().is_some();
|
||||
let reasoning = message.reasoning();
|
||||
// Output logprobs and token IDs cover the complete generated token stream.
|
||||
// When reasoning is hidden, omit them rather than leaking hidden reasoning
|
||||
// tokens through per-token metadata.
|
||||
let include_output_metadata = include_reasoning || reasoning.is_none();
|
||||
let finish_reason = chat_finish_reason_to_openai(&finish_reason, saw_tool_calls)?.to_string();
|
||||
let tool_calls = message
|
||||
.tool_calls()
|
||||
@@ -169,7 +177,7 @@ async fn collect_chat_completion(
|
||||
},
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let logprobs = if requested_logprobs {
|
||||
let logprobs = if requested_logprobs && include_output_metadata {
|
||||
Some(decoded_logprobs_to_openai_chat(
|
||||
logprobs.as_ref().ok_or_else(|| {
|
||||
server_error!("chat response requested logprobs but generation returned none")
|
||||
@@ -207,12 +215,12 @@ async fn collect_chat_completion(
|
||||
None => Some(message.text()).filter(|t| !t.is_empty()),
|
||||
},
|
||||
tool_calls: Some(tool_calls).filter(|calls| !calls.is_empty()),
|
||||
reasoning: message.reasoning(),
|
||||
reasoning: if include_reasoning { reasoning } else { None },
|
||||
},
|
||||
logprobs,
|
||||
finish_reason: Some(finish_reason),
|
||||
stop_reason,
|
||||
token_ids: return_token_ids.then_some(token_ids),
|
||||
token_ids: (return_token_ids && include_output_metadata).then_some(token_ids),
|
||||
}],
|
||||
usage: Some(usage),
|
||||
system_fingerprint: None,
|
||||
@@ -232,12 +240,18 @@ async fn chat_completion_chunk_stream(
|
||||
log_request: bool,
|
||||
include_usage: bool,
|
||||
requested_logprobs: bool,
|
||||
include_reasoning: bool,
|
||||
echo: Option<String>,
|
||||
return_token_ids: bool,
|
||||
return_tokens_as_token_ids: bool,
|
||||
mut y: TryYielder<ChatCompletionStreamResponse, ApiError>,
|
||||
) -> Result<(), ApiError> {
|
||||
let mut saw_tool_calls = false;
|
||||
// `LogprobsDelta` is emitted after all chat events for one decoded update.
|
||||
// If that update contains hidden reasoning, including delimiter-only block
|
||||
// starts or ends, omit its token metadata as well as its visible delta.
|
||||
let mut inside_hidden_reasoning = false;
|
||||
let mut suppress_current_update_metadata = false;
|
||||
|
||||
// If the client requested logprobs or token_ids, we need to buffer chunks until
|
||||
// we receive the separate `LogprobsDelta` event, so that we can emit one
|
||||
@@ -268,29 +282,44 @@ async fn chat_completion_chunk_stream(
|
||||
}
|
||||
}
|
||||
Ok(ChatEvent::BlockDelta { kind, delta, .. }) => {
|
||||
if let Some(pending_chunk) = pending_chunk.as_mut() {
|
||||
pending_chunk.push_block_delta(kind, delta);
|
||||
let include_delta =
|
||||
include_reasoning || !matches!(kind, AssistantBlockKind::Reasoning);
|
||||
if include_delta {
|
||||
if let Some(pending_chunk) = pending_chunk.as_mut() {
|
||||
pending_chunk.push_block_delta(kind, delta);
|
||||
} else {
|
||||
y.yield_ok(block_delta_chunk(
|
||||
&request_id,
|
||||
&response_model,
|
||||
created,
|
||||
kind,
|
||||
delta,
|
||||
))
|
||||
.await;
|
||||
}
|
||||
} else {
|
||||
y.yield_ok(block_delta_chunk(
|
||||
&request_id,
|
||||
&response_model,
|
||||
created,
|
||||
kind,
|
||||
delta,
|
||||
))
|
||||
.await;
|
||||
suppress_current_update_metadata = true;
|
||||
}
|
||||
}
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs,
|
||||
token_ids,
|
||||
}) => {
|
||||
let openai_logprobs = logprobs
|
||||
.as_ref()
|
||||
.map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids))
|
||||
.transpose()?;
|
||||
let openai_token_ids =
|
||||
return_token_ids.then_some(token_ids).filter(|t| !t.is_empty());
|
||||
let include_metadata =
|
||||
!suppress_current_update_metadata && !inside_hidden_reasoning;
|
||||
suppress_current_update_metadata = false;
|
||||
let openai_logprobs = if include_metadata {
|
||||
logprobs
|
||||
.as_ref()
|
||||
.map(|lp| decoded_logprobs_to_openai_chat(lp, return_tokens_as_token_ids))
|
||||
.transpose()?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let openai_token_ids = include_metadata
|
||||
.then_some(token_ids)
|
||||
.and_then(|token_ids| return_token_ids.then_some(token_ids))
|
||||
.filter(|t| !t.is_empty());
|
||||
if let Some(pending_chunk) = pending_chunk.as_mut() {
|
||||
pending_chunk.logprobs = openai_logprobs;
|
||||
pending_chunk.token_ids = openai_token_ids;
|
||||
@@ -311,9 +340,17 @@ async fn chat_completion_chunk_stream(
|
||||
}
|
||||
Ok(ChatEvent::BlockStart { kind, .. }) => {
|
||||
debug!(?kind, "starting new block");
|
||||
if !include_reasoning && matches!(kind, AssistantBlockKind::Reasoning) {
|
||||
inside_hidden_reasoning = true;
|
||||
suppress_current_update_metadata = true;
|
||||
}
|
||||
}
|
||||
Ok(ChatEvent::BlockEnd { .. }) => {
|
||||
debug!("ending current block");
|
||||
if inside_hidden_reasoning {
|
||||
inside_hidden_reasoning = false;
|
||||
suppress_current_update_metadata = true;
|
||||
}
|
||||
}
|
||||
Ok(ChatEvent::ToolCallStart { index, id, name }) => {
|
||||
let tool_index = index as u32;
|
||||
@@ -763,7 +800,9 @@ fn stop_reason_to_json(stop_reason: &StopReason) -> Value {
|
||||
mod tests {
|
||||
use futures::{StreamExt as _, stream};
|
||||
use serde_json::json;
|
||||
use vllm_chat::{AssistantBlockKind, AssistantToolCall, ChatEvent, FinishReason};
|
||||
use vllm_chat::{
|
||||
AssistantBlockKind, AssistantContentBlock, AssistantToolCall, ChatEvent, FinishReason,
|
||||
};
|
||||
use vllm_engine_core_client::protocol::StopReason;
|
||||
use vllm_text::{DecodedLogprobs, DecodedPositionLogprobs, DecodedTokenLogprob};
|
||||
|
||||
@@ -895,6 +934,7 @@ mod tests {
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
true,
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
@@ -958,6 +998,7 @@ mod tests {
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
true,
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
@@ -976,6 +1017,292 @@ mod tests {
|
||||
assert!(chunks[1].choices[0].logprobs.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn chunk_stream_omits_reasoning_delta_when_disabled() {
|
||||
let stream = stream::iter(vec![
|
||||
Ok(ChatEvent::Start {
|
||||
prompt_token_ids: vec![].into(),
|
||||
prompt_logprobs: None,
|
||||
}),
|
||||
Ok(ChatEvent::BlockDelta {
|
||||
index: 0,
|
||||
kind: AssistantBlockKind::Reasoning,
|
||||
delta: "think".to_string(),
|
||||
}),
|
||||
Ok(ChatEvent::BlockDelta {
|
||||
index: 1,
|
||||
kind: AssistantBlockKind::Text,
|
||||
delta: "answer".to_string(),
|
||||
}),
|
||||
Ok(ChatEvent::Done {
|
||||
message: Default::default(),
|
||||
prompt_token_count: 1,
|
||||
output_token_count: 2,
|
||||
finish_reason: FinishReason::stop_eos(),
|
||||
kv_transfer_params: None,
|
||||
}),
|
||||
]);
|
||||
|
||||
let chunks = chat_completion_chunk_stream(
|
||||
stream,
|
||||
"chatcmpl-1".to_string(),
|
||||
"model".to_string(),
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
.await
|
||||
.into_iter()
|
||||
.collect::<Result<Vec<_>, _>>()
|
||||
.expect("stream chunks");
|
||||
|
||||
assert_eq!(chunks.len(), 3);
|
||||
assert_eq!(
|
||||
chunks[1].choices[0].delta.content.as_deref(),
|
||||
Some("answer")
|
||||
);
|
||||
assert!(
|
||||
chunks
|
||||
.iter()
|
||||
.all(|chunk| chunk.choices.iter().all(|choice| choice.delta.reasoning.is_none()))
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn chunk_stream_omits_logprobs_for_suppressed_reasoning() {
|
||||
let stream = stream::iter(vec![
|
||||
Ok(ChatEvent::Start {
|
||||
prompt_token_ids: vec![].into(),
|
||||
prompt_logprobs: None,
|
||||
}),
|
||||
Ok(ChatEvent::BlockDelta {
|
||||
index: 0,
|
||||
kind: AssistantBlockKind::Reasoning,
|
||||
delta: "think".to_string(),
|
||||
}),
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs: Some(DecodedLogprobs {
|
||||
positions: vec![DecodedPositionLogprobs {
|
||||
entries: vec![DecodedTokenLogprob {
|
||||
token_id: 11,
|
||||
token: "think".to_string(),
|
||||
logprob: -0.1,
|
||||
rank: 1,
|
||||
}],
|
||||
}],
|
||||
}),
|
||||
token_ids: vec![11],
|
||||
}),
|
||||
Ok(ChatEvent::BlockDelta {
|
||||
index: 1,
|
||||
kind: AssistantBlockKind::Text,
|
||||
delta: "answer".to_string(),
|
||||
}),
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs: Some(DecodedLogprobs {
|
||||
positions: vec![DecodedPositionLogprobs {
|
||||
entries: vec![DecodedTokenLogprob {
|
||||
token_id: 22,
|
||||
token: "answer".to_string(),
|
||||
logprob: -0.2,
|
||||
rank: 1,
|
||||
}],
|
||||
}],
|
||||
}),
|
||||
token_ids: vec![22],
|
||||
}),
|
||||
Ok(ChatEvent::Done {
|
||||
message: Default::default(),
|
||||
prompt_token_count: 1,
|
||||
output_token_count: 2,
|
||||
finish_reason: FinishReason::stop_eos(),
|
||||
kv_transfer_params: None,
|
||||
}),
|
||||
]);
|
||||
|
||||
let chunks = chat_completion_chunk_stream(
|
||||
stream,
|
||||
"chatcmpl-1".to_string(),
|
||||
"model".to_string(),
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
None,
|
||||
true,
|
||||
false,
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
.await
|
||||
.into_iter()
|
||||
.collect::<Result<Vec<_>, _>>()
|
||||
.expect("stream chunks");
|
||||
|
||||
assert_eq!(chunks.len(), 3);
|
||||
let choice = &chunks[1].choices[0];
|
||||
assert_eq!(choice.delta.content.as_deref(), Some("answer"));
|
||||
assert_eq!(choice.token_ids.as_deref(), Some(&[22][..]));
|
||||
let logprobs = choice.logprobs.as_ref().expect("answer logprobs");
|
||||
let content = logprobs.content.as_ref().expect("logprobs content");
|
||||
assert_eq!(content[0].token, "answer");
|
||||
assert!(chunks.iter().all(|chunk| {
|
||||
chunk.choices.iter().all(|choice| {
|
||||
choice.delta.reasoning.is_none()
|
||||
&& choice.token_ids.as_deref() != Some(&[11][..])
|
||||
&& choice
|
||||
.logprobs
|
||||
.as_ref()
|
||||
.and_then(|logprobs| logprobs.content.as_ref())
|
||||
.is_none_or(|content| content.iter().all(|entry| entry.token != "think"))
|
||||
})
|
||||
}));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn chunk_stream_omits_logprobs_for_hidden_reasoning_delimiters() {
|
||||
let stream = stream::iter(vec![
|
||||
Ok(ChatEvent::Start {
|
||||
prompt_token_ids: vec![].into(),
|
||||
prompt_logprobs: None,
|
||||
}),
|
||||
Ok(ChatEvent::BlockStart {
|
||||
index: 0,
|
||||
kind: AssistantBlockKind::Reasoning,
|
||||
}),
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs: Some(DecodedLogprobs {
|
||||
positions: vec![DecodedPositionLogprobs {
|
||||
entries: vec![DecodedTokenLogprob {
|
||||
token_id: 11,
|
||||
token: "<think>".to_string(),
|
||||
logprob: -0.1,
|
||||
rank: 1,
|
||||
}],
|
||||
}],
|
||||
}),
|
||||
token_ids: vec![11],
|
||||
}),
|
||||
Ok(ChatEvent::BlockDelta {
|
||||
index: 0,
|
||||
kind: AssistantBlockKind::Reasoning,
|
||||
delta: "think".to_string(),
|
||||
}),
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs: Some(DecodedLogprobs {
|
||||
positions: vec![DecodedPositionLogprobs {
|
||||
entries: vec![DecodedTokenLogprob {
|
||||
token_id: 12,
|
||||
token: "think".to_string(),
|
||||
logprob: -0.2,
|
||||
rank: 1,
|
||||
}],
|
||||
}],
|
||||
}),
|
||||
token_ids: vec![12],
|
||||
}),
|
||||
Ok(ChatEvent::BlockEnd {
|
||||
index: 0,
|
||||
block: AssistantContentBlock::Reasoning {
|
||||
text: "think".to_string(),
|
||||
},
|
||||
}),
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs: Some(DecodedLogprobs {
|
||||
positions: vec![DecodedPositionLogprobs {
|
||||
entries: vec![DecodedTokenLogprob {
|
||||
token_id: 13,
|
||||
token: "</think>".to_string(),
|
||||
logprob: -0.3,
|
||||
rank: 1,
|
||||
}],
|
||||
}],
|
||||
}),
|
||||
token_ids: vec![13],
|
||||
}),
|
||||
Ok(ChatEvent::BlockStart {
|
||||
index: 1,
|
||||
kind: AssistantBlockKind::Text,
|
||||
}),
|
||||
Ok(ChatEvent::BlockDelta {
|
||||
index: 1,
|
||||
kind: AssistantBlockKind::Text,
|
||||
delta: "answer".to_string(),
|
||||
}),
|
||||
Ok(ChatEvent::LogprobsDelta {
|
||||
logprobs: Some(DecodedLogprobs {
|
||||
positions: vec![DecodedPositionLogprobs {
|
||||
entries: vec![DecodedTokenLogprob {
|
||||
token_id: 22,
|
||||
token: "answer".to_string(),
|
||||
logprob: -0.4,
|
||||
rank: 1,
|
||||
}],
|
||||
}],
|
||||
}),
|
||||
token_ids: vec![22],
|
||||
}),
|
||||
Ok(ChatEvent::Done {
|
||||
message: Default::default(),
|
||||
prompt_token_count: 1,
|
||||
output_token_count: 4,
|
||||
finish_reason: FinishReason::stop_eos(),
|
||||
kv_transfer_params: None,
|
||||
}),
|
||||
]);
|
||||
|
||||
let chunks = chat_completion_chunk_stream(
|
||||
stream,
|
||||
"chatcmpl-1".to_string(),
|
||||
"model".to_string(),
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
None,
|
||||
true,
|
||||
false,
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
.await
|
||||
.into_iter()
|
||||
.collect::<Result<Vec<_>, _>>()
|
||||
.expect("stream chunks");
|
||||
|
||||
assert_eq!(chunks.len(), 3);
|
||||
let choice = &chunks[1].choices[0];
|
||||
assert_eq!(choice.delta.content.as_deref(), Some("answer"));
|
||||
assert_eq!(choice.token_ids.as_deref(), Some(&[22][..]));
|
||||
let logprobs = choice.logprobs.as_ref().expect("answer logprobs");
|
||||
let content = logprobs.content.as_ref().expect("logprobs content");
|
||||
assert_eq!(content[0].token, "answer");
|
||||
assert!(chunks.iter().all(|chunk| {
|
||||
chunk.choices.iter().all(|choice| {
|
||||
choice.delta.reasoning.is_none()
|
||||
&& !choice
|
||||
.token_ids
|
||||
.as_ref()
|
||||
.is_some_and(|ids| matches!(ids.as_slice(), [11] | [12] | [13]))
|
||||
&& choice
|
||||
.logprobs
|
||||
.as_ref()
|
||||
.and_then(|logprobs| logprobs.content.as_ref())
|
||||
.is_none_or(|content| {
|
||||
content.iter().all(|entry| {
|
||||
!matches!(entry.token.as_str(), "<think>" | "think" | "</think>")
|
||||
})
|
||||
})
|
||||
})
|
||||
}));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn chunk_stream_preserves_tool_call_index_and_omits_id_from_arguments_delta() {
|
||||
let stream = stream::iter(vec![
|
||||
@@ -1017,6 +1344,7 @@ mod tests {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
|
||||
@@ -29,6 +29,8 @@ pub struct PreparedRequest {
|
||||
pub requested_logprobs: bool,
|
||||
/// Whether the caller requested top-level prompt logprobs.
|
||||
pub include_prompt_logprobs: bool,
|
||||
/// Whether to include reasoning content in OpenAI responses.
|
||||
pub include_reasoning: bool,
|
||||
/// Lowered chat request for `vllm-chat`.
|
||||
pub chat_request: ChatRequest,
|
||||
/// Last assistant-role message content to echo back when `echo=true`.
|
||||
@@ -57,6 +59,7 @@ pub(crate) fn prepare_chat_request(
|
||||
.as_ref()
|
||||
.map(|request| request.lora_name.clone())
|
||||
.unwrap_or_else(|| lora_resolution.model_names.first().cloned().unwrap_or_default());
|
||||
let include_reasoning = request.include_reasoning;
|
||||
let echo = request
|
||||
.echo
|
||||
.then(|| extract_last_assistant_content(&request.messages))
|
||||
@@ -146,6 +149,7 @@ pub(crate) fn prepare_chat_request(
|
||||
include_usage,
|
||||
requested_logprobs,
|
||||
include_prompt_logprobs,
|
||||
include_reasoning,
|
||||
chat_request,
|
||||
echo,
|
||||
return_token_ids: request.return_token_ids.unwrap_or(false),
|
||||
@@ -480,6 +484,23 @@ mod tests {
|
||||
assert_eq!(prepared.chat_request.tool_choice, ChatToolChoice::Auto);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prepare_chat_request_preserves_include_reasoning_false() {
|
||||
let request = ChatCompletionRequest {
|
||||
include_reasoning: false,
|
||||
..base_request()
|
||||
};
|
||||
|
||||
let prepared = prepare_chat_request(
|
||||
request,
|
||||
&served(&["Qwen/Qwen1.5-0.5B-Chat"]),
|
||||
ResolvedRequestContext::default(),
|
||||
)
|
||||
.expect("request is valid");
|
||||
|
||||
assert!(!prepared.include_reasoning);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prepare_chat_request_preserves_sampling_passthrough_fields() {
|
||||
let request = ChatCompletionRequest {
|
||||
|
||||
@@ -120,12 +120,6 @@ pub(super) fn validate_request_compat(
|
||||
"thinking_token_budget",
|
||||
"thinking_token_budget is not supported.",
|
||||
)?;
|
||||
if !request.include_reasoning {
|
||||
bail_invalid_request!(
|
||||
param = "include_reasoning",
|
||||
"include_reasoning is not supported."
|
||||
);
|
||||
}
|
||||
reject_non_default(
|
||||
request.media_io_kwargs.as_ref(),
|
||||
"media_io_kwargs",
|
||||
@@ -312,6 +306,17 @@ mod tests {
|
||||
.expect("reasoning_effort should be accepted");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn validate_request_compat_accepts_include_reasoning_false() {
|
||||
let request = ChatCompletionRequest {
|
||||
include_reasoning: false,
|
||||
..base_request()
|
||||
};
|
||||
|
||||
validate_request_compat(&request, &served(&["Qwen/Qwen1.5-0.5B-Chat"]))
|
||||
.expect("include_reasoning=false should be accepted");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn validate_request_compat_rejects_top_logprobs_without_logprobs() {
|
||||
let request = ChatCompletionRequest {
|
||||
|
||||
@@ -3492,6 +3492,170 @@ async fn reasoning_blocks_are_mapped_to_reasoning_sse_chunks() {
|
||||
assert!(text.contains("\"content\":\"answer\""), "{text}");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
#[serial]
|
||||
async fn include_reasoning_false_suppresses_reasoning_in_non_stream_chat() {
|
||||
let (app, engine_task) = test_app_with_backend_and_stream_output_specs(
|
||||
Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")),
|
||||
vec![
|
||||
(bytes_to_token_ids(b"<think>think</think>"), None),
|
||||
(
|
||||
bytes_to_token_ids(b"answer"),
|
||||
Some(EngineCoreFinishReason::Length),
|
||||
),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
|
||||
let response = app
|
||||
.clone()
|
||||
.call(
|
||||
Request::builder()
|
||||
.method("POST")
|
||||
.uri("/v1/chat/completions")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"model": "Qwen/Qwen1.5-0.5B-Chat",
|
||||
"stream": false,
|
||||
"include_reasoning": false,
|
||||
"messages": [{"role": "user", "content": "hello"}]
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("build request"),
|
||||
)
|
||||
.await
|
||||
.expect("call app");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
|
||||
engine_task.await.expect("mock engine task");
|
||||
let text = String::from_utf8(body.to_vec()).expect("utf8 body");
|
||||
let json: serde_json::Value = serde_json::from_str(&text).expect("decode json");
|
||||
|
||||
assert_eq!(json["choices"][0]["message"]["content"], "answer");
|
||||
assert!(
|
||||
json["choices"][0]["message"]
|
||||
.as_object()
|
||||
.is_some_and(|message| !message.contains_key("reasoning")),
|
||||
"{text}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
#[serial]
|
||||
async fn include_reasoning_false_suppresses_non_stream_output_metadata() {
|
||||
let ipc = IpcNamespace::new().expect("create ipc namespace");
|
||||
let handshake_address = ipc.handshake_endpoint();
|
||||
let engine_id = b"engine-openai-hidden-reasoning-logprobs".to_vec();
|
||||
|
||||
let engine_task = MockEngineTask::new(spawn_mock_engine_task(
|
||||
handshake_address.clone(),
|
||||
engine_id.clone(),
|
||||
|dealer, push| {
|
||||
boxed_test_future(async move {
|
||||
let add = recv_engine_message(dealer).await;
|
||||
let request: EngineCoreRequest =
|
||||
rmp_serde::from_slice(&add[1]).expect("decode request");
|
||||
let reasoning_token_ids = bytes_to_token_ids(b"<think>think</think>");
|
||||
let answer_token_ids = bytes_to_token_ids(b"answer");
|
||||
|
||||
send_outputs(
|
||||
push,
|
||||
EngineCoreOutputs {
|
||||
engine_index: 0,
|
||||
outputs: vec![
|
||||
request_output_with_logprobs(
|
||||
&request.request_id,
|
||||
reasoning_token_ids.clone(),
|
||||
None,
|
||||
None,
|
||||
Some(sample_logprobs_for_tokens(&reasoning_token_ids)),
|
||||
None,
|
||||
),
|
||||
request_output_with_logprobs(
|
||||
&request.request_id,
|
||||
answer_token_ids.clone(),
|
||||
Some(EngineCoreFinishReason::Length),
|
||||
None,
|
||||
Some(sample_logprobs_for_tokens(&answer_token_ids)),
|
||||
None,
|
||||
),
|
||||
],
|
||||
scheduler_stats: None,
|
||||
timestamp: 0.0,
|
||||
utility_output: None,
|
||||
finished_requests: None,
|
||||
wave_complete: None,
|
||||
start_wave: None,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
})
|
||||
},
|
||||
));
|
||||
|
||||
let client = EngineCoreClient::connect(
|
||||
EngineCoreClientConfig::new_single(handshake_address)
|
||||
.with_model_name("test-model")
|
||||
.with_local_input_output_addresses(
|
||||
Some(ipc.input_endpoint()),
|
||||
Some(ipc.output_endpoint()),
|
||||
),
|
||||
)
|
||||
.await
|
||||
.expect("connect client");
|
||||
let chat = ChatLlm::from_shared_backend(
|
||||
test_llm(client),
|
||||
Arc::new(FakeChatBackend::with_model_id("Qwen/Qwen3-0.6B")),
|
||||
);
|
||||
let mut app = build_router(Arc::new(AppState::new(
|
||||
vec!["Qwen/Qwen1.5-0.5B-Chat".to_string()],
|
||||
chat,
|
||||
)));
|
||||
|
||||
let response = app
|
||||
.call(
|
||||
Request::builder()
|
||||
.method("POST")
|
||||
.uri("/v1/chat/completions")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"model": "Qwen/Qwen1.5-0.5B-Chat",
|
||||
"stream": false,
|
||||
"include_reasoning": false,
|
||||
"logprobs": true,
|
||||
"return_token_ids": true,
|
||||
"messages": [{"role": "user", "content": "hello"}]
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("build request"),
|
||||
)
|
||||
.await
|
||||
.expect("call app");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
|
||||
engine_task.await.expect("mock engine task");
|
||||
let text = String::from_utf8(body.to_vec()).expect("utf8 body");
|
||||
let json: serde_json::Value = serde_json::from_str(&text).expect("decode json");
|
||||
let choice = json["choices"][0].as_object().expect("choice object");
|
||||
|
||||
assert_eq!(json["choices"][0]["message"]["content"], "answer");
|
||||
assert!(
|
||||
json["choices"][0]["message"]
|
||||
.as_object()
|
||||
.is_some_and(|message| !message.contains_key("reasoning")),
|
||||
"{text}"
|
||||
);
|
||||
assert!(!choice.contains_key("logprobs"), "{text}");
|
||||
assert!(!choice.contains_key("token_ids"), "{text}");
|
||||
assert!(json["prompt_token_ids"].is_array(), "{text}");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
#[serial]
|
||||
async fn tool_calls_are_mapped_to_tool_call_sse_chunks() {
|
||||
|
||||
Reference in New Issue
Block a user