mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Rust Frontend] Add /version endpoint using engine-reported value (#43854)
Signed-off-by: Bugen Zhao <i@bugenzhao.com>
This commit is contained in:
@@ -93,7 +93,7 @@ pub struct ChatLlm {
|
||||
text: TextLlm,
|
||||
backend: DynChatBackend,
|
||||
/// Effective model dtype reported by the engine.
|
||||
model_dtype: Option<ModelDtype>,
|
||||
model_dtype: ModelDtype,
|
||||
/// Tool-call parser selection.
|
||||
tool_call_parser: ParserSelection,
|
||||
/// Reasoning parser selection.
|
||||
@@ -135,7 +135,7 @@ impl ChatLlm {
|
||||
}
|
||||
|
||||
/// Override the effective model dtype used for multimodal tensor encoding.
|
||||
pub fn with_model_dtype(mut self, model_dtype: Option<ModelDtype>) -> Self {
|
||||
pub fn with_model_dtype(mut self, model_dtype: ModelDtype) -> Self {
|
||||
self.model_dtype = model_dtype;
|
||||
self
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, LazyLock, Once};
|
||||
use std::sync::{Arc, LazyLock};
|
||||
|
||||
use itertools::izip;
|
||||
use llm_multimodal::{
|
||||
@@ -239,7 +239,7 @@ pub(crate) async fn finalize_rendered_prompt(
|
||||
request: &ChatRequest,
|
||||
rendered: RenderedPrompt,
|
||||
info: Option<&MultimodalModelInfo>,
|
||||
model_dtype: Option<ModelDtype>,
|
||||
model_dtype: ModelDtype,
|
||||
) -> Result<(Prompt, Option<MmFeatures>)> {
|
||||
if !request.has_multimodal() {
|
||||
return Ok((rendered.prompt, None));
|
||||
@@ -249,16 +249,6 @@ pub(crate) async fn finalize_rendered_prompt(
|
||||
bail_multimodal!("multimodal chat renderer must return a text prompt before expansion");
|
||||
};
|
||||
let media_parts = extract_media_parts(request)?;
|
||||
let model_dtype = model_dtype.unwrap_or_else(|| {
|
||||
static WARN_ONCE: Once = Once::new();
|
||||
WARN_ONCE.call_once(|| {
|
||||
warn!(
|
||||
"engine handshake did not report model dtype; \
|
||||
falling back to float32 for multimodal tensor encoding"
|
||||
);
|
||||
});
|
||||
ModelDtype::Float32
|
||||
});
|
||||
|
||||
let mut prompt_token_ids = info
|
||||
.context
|
||||
|
||||
@@ -290,10 +290,8 @@ impl EngineCoreClient {
|
||||
|
||||
// If any engine reported a dp_stats_address in its ready response, use it
|
||||
// as the external coordinator address.
|
||||
let dp_stats_address: Option<String> = engines
|
||||
.iter()
|
||||
.filter_map(|e| e.ready_response.as_ref())
|
||||
.find_map(|r| r.dp_stats_address.clone());
|
||||
let dp_stats_address: Option<String> =
|
||||
engines.iter().find_map(|engine| engine.ready_response.dp_stats_address.clone());
|
||||
|
||||
let (coordinator, coordinator_output_task, coordinator_task) =
|
||||
if let Some(coordinator_transport) = connected.coordinator {
|
||||
@@ -368,40 +366,44 @@ impl EngineCoreClient {
|
||||
/// Return the ready responses received from all engines on the input
|
||||
/// socket.
|
||||
pub fn ready_responses(&self) -> Vec<&EngineCoreReadyResponse> {
|
||||
self.engines
|
||||
.iter()
|
||||
.filter_map(|engine| engine.ready_response.as_ref())
|
||||
.collect()
|
||||
self.engines.iter().map(|engine| &engine.ready_response).collect()
|
||||
}
|
||||
|
||||
/// Return the engine-reported effective model dtype, when available.
|
||||
pub fn model_dtype(&self) -> Option<ModelDtype> {
|
||||
/// Return the engine-reported effective model dtype.
|
||||
pub fn model_dtype(&self) -> ModelDtype {
|
||||
self.engines
|
||||
.iter()
|
||||
.filter_map(|engine| engine.ready_response.as_ref())
|
||||
.find_map(|response| response.dtype)
|
||||
.first()
|
||||
.expect("engine core client requires at least one engine")
|
||||
.ready_response
|
||||
.dtype
|
||||
}
|
||||
|
||||
/// Return the engine-reported Python vLLM version.
|
||||
pub fn vllm_version(&self) -> &str {
|
||||
self.engines
|
||||
.first()
|
||||
.expect("engine core client requires at least one engine")
|
||||
.ready_response
|
||||
.vllm_version
|
||||
.as_str()
|
||||
}
|
||||
|
||||
/// Return the total number of GPU blocks summed across all connected
|
||||
/// engines.
|
||||
pub fn total_num_gpu_blocks(&self) -> u64 {
|
||||
self.engines
|
||||
.iter()
|
||||
.filter_map(|engine| engine.ready_response.as_ref())
|
||||
.map(|r| r.num_gpu_blocks)
|
||||
.sum()
|
||||
self.engines.iter().map(|engine| engine.ready_response.num_gpu_blocks).sum()
|
||||
}
|
||||
|
||||
/// Return the minimum engine-reported `max_model_len` across all engines.
|
||||
///
|
||||
/// This is the auto-fitted value after KV cache profiling and may differ
|
||||
/// from the originally configured value.
|
||||
pub fn max_model_len(&self) -> Option<u32> {
|
||||
pub fn max_model_len(&self) -> u32 {
|
||||
self.engines
|
||||
.iter()
|
||||
.filter_map(|e| e.ready_response.as_ref())
|
||||
.map(|r| r.max_model_len as u32)
|
||||
.map(|engine| engine.ready_response.max_model_len as u32)
|
||||
.min()
|
||||
.expect("engine core client requires at least one engine")
|
||||
}
|
||||
|
||||
/// Get the model name associated with this client used for metrics
|
||||
|
||||
@@ -382,6 +382,7 @@ mod tests {
|
||||
use zeromq::{RouterSocket, Socket};
|
||||
|
||||
use super::*;
|
||||
use crate::mock_engine::default_ready_response;
|
||||
|
||||
async fn test_inner() -> ClientInner {
|
||||
let mut socket = RouterSocket::new();
|
||||
@@ -392,7 +393,7 @@ mod tests {
|
||||
"test-model".to_string(),
|
||||
&[ConnectedEngine {
|
||||
engine_id: EngineId::from(b"engine-0"),
|
||||
ready_response: None,
|
||||
ready_response: default_ready_response(),
|
||||
}],
|
||||
)
|
||||
}
|
||||
|
||||
@@ -339,15 +339,20 @@ mod tests {
|
||||
use super::{EngineRoutingState, RequestRegistry, UtilityRegistry};
|
||||
use crate::EngineId;
|
||||
use crate::client::state::EngineLoadSnapshot;
|
||||
use crate::mock_engine::default_ready_response;
|
||||
use crate::protocol::{EngineCoreFinishReason, EngineCoreOutput};
|
||||
use crate::transport::ConnectedEngine;
|
||||
|
||||
fn connected_engine(engine_id: EngineId) -> ConnectedEngine {
|
||||
ConnectedEngine {
|
||||
engine_id,
|
||||
ready_response: default_ready_response(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn registry_rejects_duplicate_request_ids() {
|
||||
let mut registry = RequestRegistry::new(&[ConnectedEngine {
|
||||
engine_id: EngineId::from(b"engine-0"),
|
||||
ready_response: None,
|
||||
}]);
|
||||
let mut registry = RequestRegistry::new(&[connected_engine(EngineId::from(b"engine-0"))]);
|
||||
registry.register("req-1".to_string(), None).unwrap();
|
||||
let error = registry.register("req-1".to_string(), None).unwrap_err();
|
||||
assert!(matches!(
|
||||
@@ -358,10 +363,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn registry_removes_finished_request_on_output() {
|
||||
let mut registry = RequestRegistry::new(&[ConnectedEngine {
|
||||
engine_id: EngineId::from(b"engine-0"),
|
||||
ready_response: None,
|
||||
}]);
|
||||
let mut registry = RequestRegistry::new(&[connected_engine(EngineId::from(b"engine-0"))]);
|
||||
registry.register("req-1".to_string(), None).unwrap();
|
||||
|
||||
let sender = registry.sender_for_output(&EngineCoreOutput {
|
||||
@@ -376,10 +378,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn registry_closes_all_requests_on_failure() {
|
||||
let mut registry = RequestRegistry::new(&[ConnectedEngine {
|
||||
engine_id: EngineId::from(b"engine-0"),
|
||||
ready_response: None,
|
||||
}]);
|
||||
let mut registry = RequestRegistry::new(&[connected_engine(EngineId::from(b"engine-0"))]);
|
||||
registry.register("req-1".to_string(), None).unwrap();
|
||||
registry.register("req-2".to_string(), None).unwrap();
|
||||
|
||||
@@ -394,14 +393,8 @@ mod tests {
|
||||
let engine_0 = EngineId::from_engine_index(0);
|
||||
let engine_1 = EngineId::from_engine_index(1);
|
||||
let mut registry = RequestRegistry::new(&[
|
||||
ConnectedEngine {
|
||||
engine_id: engine_0.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: engine_1.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
connected_engine(engine_0.clone()),
|
||||
connected_engine(engine_1.clone()),
|
||||
]);
|
||||
let (chosen_0, _) = registry.register("req-1".to_string(), None).unwrap();
|
||||
let (chosen_1, _) = registry.register("req-2".to_string(), None).unwrap();
|
||||
@@ -428,14 +421,8 @@ mod tests {
|
||||
let engine_0 = EngineId::from_engine_index(0);
|
||||
let engine_1 = EngineId::from_engine_index(1);
|
||||
let mut registry = RequestRegistry::new(&[
|
||||
ConnectedEngine {
|
||||
engine_id: engine_0.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: engine_1.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
connected_engine(engine_0.clone()),
|
||||
connected_engine(engine_1.clone()),
|
||||
]);
|
||||
|
||||
let (chosen_0, _) = registry.register("req-1".to_string(), None).unwrap();
|
||||
@@ -488,14 +475,8 @@ mod tests {
|
||||
let engine_0 = EngineId::from_engine_index(0);
|
||||
let engine_1 = EngineId::from_engine_index(1);
|
||||
let mut registry = RequestRegistry::new(&[
|
||||
ConnectedEngine {
|
||||
engine_id: engine_0.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: engine_1.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
connected_engine(engine_0.clone()),
|
||||
connected_engine(engine_1.clone()),
|
||||
]);
|
||||
|
||||
assert!(registry.apply_scheduler_counts(
|
||||
@@ -523,18 +504,9 @@ mod tests {
|
||||
let engine_1 = EngineId::from_engine_index(1);
|
||||
let engine_2 = EngineId::from_engine_index(2);
|
||||
let mut registry = RequestRegistry::new(&[
|
||||
ConnectedEngine {
|
||||
engine_id: engine_0.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: engine_1.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: engine_2.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
connected_engine(engine_0.clone()),
|
||||
connected_engine(engine_1.clone()),
|
||||
connected_engine(engine_2.clone()),
|
||||
]);
|
||||
|
||||
// Explicitly target rank 2 (third engine).
|
||||
@@ -555,14 +527,8 @@ mod tests {
|
||||
let engine_0 = EngineId::from_engine_index(0);
|
||||
let engine_1 = EngineId::from_engine_index(1);
|
||||
let mut registry = RequestRegistry::new(&[
|
||||
ConnectedEngine {
|
||||
engine_id: engine_0.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: engine_1.clone(),
|
||||
ready_response: None,
|
||||
},
|
||||
connected_engine(engine_0.clone()),
|
||||
connected_engine(engine_1.clone()),
|
||||
]);
|
||||
|
||||
// Load-balance: first two go to engine_0 and engine_1.
|
||||
@@ -577,14 +543,8 @@ mod tests {
|
||||
#[test]
|
||||
fn register_with_out_of_range_rank_returns_error() {
|
||||
let mut registry = RequestRegistry::new(&[
|
||||
ConnectedEngine {
|
||||
engine_id: EngineId::from_engine_index(0),
|
||||
ready_response: None,
|
||||
},
|
||||
ConnectedEngine {
|
||||
engine_id: EngineId::from_engine_index(1),
|
||||
ready_response: None,
|
||||
},
|
||||
connected_engine(EngineId::from_engine_index(0)),
|
||||
connected_engine(EngineId::from_engine_index(1)),
|
||||
]);
|
||||
|
||||
let error = registry.register("req-1".to_string(), Some(2)).unwrap_err();
|
||||
@@ -600,10 +560,7 @@ mod tests {
|
||||
#[test]
|
||||
fn register_with_rank_on_single_engine_only_accepts_zero() {
|
||||
let engine_0 = EngineId::from_engine_index(0);
|
||||
let mut registry = RequestRegistry::new(&[ConnectedEngine {
|
||||
engine_id: engine_0.clone(),
|
||||
ready_response: None,
|
||||
}]);
|
||||
let mut registry = RequestRegistry::new(&[connected_engine(engine_0.clone())]);
|
||||
|
||||
let (chosen, _) = registry.register("req-ok".to_string(), Some(0)).unwrap();
|
||||
assert_eq!(chosen, engine_0);
|
||||
|
||||
@@ -47,7 +47,8 @@ pub fn default_ready_response() -> EngineCoreReadyResponse {
|
||||
max_model_len: DEFAULT_MOCK_MAX_MODEL_LEN,
|
||||
num_gpu_blocks: DEFAULT_MOCK_NUM_GPU_BLOCKS,
|
||||
dp_stats_address: None,
|
||||
dtype: Some(ModelDtype::Float32),
|
||||
dtype: ModelDtype::Float32,
|
||||
vllm_version: "test-vllm-version".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,10 +39,9 @@ pub struct EngineCoreReadyResponse {
|
||||
/// DP coordinator stats publish address, if applicable.
|
||||
pub dp_stats_address: Option<String>,
|
||||
/// Effective model dtype after Python vLLM resolves `--dtype`.
|
||||
// TODO: This is currently not wired up on the engine side. After it's added, remove `Option`
|
||||
// and `serde(default)`.
|
||||
#[serde(default)]
|
||||
pub dtype: Option<ModelDtype>,
|
||||
pub dtype: ModelDtype,
|
||||
/// Python vLLM version reported by the engine process.
|
||||
pub vllm_version: String,
|
||||
}
|
||||
|
||||
/// Frontend-owned ZMQ addresses that are sent to the engine during startup
|
||||
@@ -69,22 +68,3 @@ pub struct HandshakeInitMessage {
|
||||
pub addresses: HandshakeAddresses,
|
||||
pub parallel_config: BTreeMap<String, OpaqueValue>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::EngineCoreReadyResponse;
|
||||
use crate::protocol::ModelDtype;
|
||||
|
||||
#[test]
|
||||
fn ready_response_accepts_effective_dtype() {
|
||||
let response: EngineCoreReadyResponse = serde_json::from_value(serde_json::json!({
|
||||
"max_model_len": 4096,
|
||||
"num_gpu_blocks": 2,
|
||||
"dp_stats_address": null,
|
||||
"dtype": "bfloat16"
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(response.dtype, Some(ModelDtype::BFloat16));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ use crate::EngineId;
|
||||
pub use crate::mock_engine::{MockCoordinatorSockets, MockEngineSockets};
|
||||
use crate::mock_engine::{
|
||||
MockEngineConfig, MockEngineDataSockets, connect_to_bootstrapped_frontend, connect_to_frontend,
|
||||
default_ready_response,
|
||||
};
|
||||
use crate::protocol::ModelDtype;
|
||||
use crate::protocol::handshake::{EngineCoreReadyResponse, HandshakeInitMessage};
|
||||
use crate::protocol::handshake::HandshakeInitMessage;
|
||||
|
||||
/// Per-test IPC endpoint namespace backed by a unique temporary directory.
|
||||
///
|
||||
@@ -57,12 +57,7 @@ fn test_mock_engine_config() -> MockEngineConfig {
|
||||
MockEngineConfig {
|
||||
local: true,
|
||||
headless: true,
|
||||
ready_response: EngineCoreReadyResponse {
|
||||
max_model_len: 4096,
|
||||
num_gpu_blocks: 0,
|
||||
dp_stats_address: None,
|
||||
dtype: Some(ModelDtype::Float32),
|
||||
},
|
||||
ready_response: default_ready_response(),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -925,6 +925,7 @@ async fn client_fail_closes_when_main_output_path_receives_dp_control() {
|
||||
.await;
|
||||
assert_eq!(client.engine_identities()[0], b"engine-0");
|
||||
assert!(client.ready_responses()[0].max_model_len > 0);
|
||||
assert_eq!(client.vllm_version(), "test-vllm-version");
|
||||
|
||||
let mut stream_1 = client.call(sample_request_with_id("req-1")).await.unwrap();
|
||||
let mut stream_2 = client.call(sample_request_with_id("req-2")).await.unwrap();
|
||||
|
||||
@@ -104,8 +104,8 @@ pub struct ConnectedEngine {
|
||||
/// The identity of the connected engine.
|
||||
pub engine_id: EngineId,
|
||||
/// Post-initialization configuration received from the engine on the input
|
||||
/// socket registration message. `None` until the registration is received.
|
||||
pub ready_response: Option<EngineCoreReadyResponse>,
|
||||
/// socket registration message.
|
||||
pub ready_response: EngineCoreReadyResponse,
|
||||
}
|
||||
|
||||
/// Represents the connected shared transport plus all registered engines after
|
||||
@@ -295,18 +295,9 @@ pub async fn connect_handshake(
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Wait for every engine to connect to the shared input socket and register itself. The
|
||||
// `ready_response` is a placeholder; it is populated for each engine by
|
||||
// `wait_for_input_registrations` below.
|
||||
let mut engines: Vec<_> = engines
|
||||
.into_keys()
|
||||
.map(|engine_id| ConnectedEngine {
|
||||
engine_id,
|
||||
ready_response: None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
wait_for_input_registrations(&mut input_socket, &mut engines, ready_timeout).await?;
|
||||
// 6. Wait for every engine to connect to the shared input socket and register itself.
|
||||
let engines =
|
||||
wait_for_input_registrations(&mut input_socket, engines.into_keys(), ready_timeout).await?;
|
||||
debug!(
|
||||
engine_count = engines.len(),
|
||||
"all engines registered on shared input socket"
|
||||
@@ -349,15 +340,13 @@ pub async fn connect_bootstrapped(
|
||||
let mut output_socket = PullSocket::new();
|
||||
let output_address = output_socket.bind(output_address).await?.to_string();
|
||||
|
||||
// TODO: follow start rank
|
||||
let mut engines = (0..engine_count)
|
||||
.map(|index| ConnectedEngine {
|
||||
engine_id: EngineId::from((index as u16).to_le_bytes().to_vec()),
|
||||
ready_response: None,
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
wait_for_input_registrations(&mut input_socket, &mut engines, ready_timeout).await?;
|
||||
let engines = wait_for_input_registrations(
|
||||
&mut input_socket,
|
||||
// TODO: follow start rank
|
||||
(0..engine_count).map(|index| EngineId::from((index as u16).to_le_bytes().to_vec())),
|
||||
ready_timeout,
|
||||
)
|
||||
.await?;
|
||||
info!(
|
||||
engine_count = engines.len(),
|
||||
"bootstrapped engines connected"
|
||||
@@ -455,17 +444,14 @@ async fn send_init_message(
|
||||
/// Simplify API server handshake"), the payload is a msgpack-encoded
|
||||
/// [`EngineCoreReadyResponse`] carrying post-initialization values such as
|
||||
/// `max_model_len`.
|
||||
///
|
||||
/// Older engines sent an empty second frame here just to establish the
|
||||
/// ROUTER/DEALER backchannel, with no structured payload on the input socket.
|
||||
/// We continue to tolerate that legacy shape so the frontend can still connect
|
||||
/// to slightly older local engine checkouts.
|
||||
async fn wait_for_input_registrations(
|
||||
input_socket: &mut RouterSocket,
|
||||
engines: &mut [ConnectedEngine],
|
||||
expected_engines: impl IntoIterator<Item = EngineId>,
|
||||
ready_timeout: Duration,
|
||||
) -> Result<()> {
|
||||
let mut pending = engines.iter().map(|e| e.engine_id.clone()).collect::<BTreeSet<_>>();
|
||||
) -> Result<Vec<ConnectedEngine>> {
|
||||
let expected_engines = expected_engines.into_iter().collect::<Vec<_>>();
|
||||
let mut pending = expected_engines.iter().cloned().collect::<BTreeSet<_>>();
|
||||
let mut ready_responses = BTreeMap::new();
|
||||
|
||||
while !pending.is_empty() {
|
||||
let registration = timeout(ready_timeout, input_socket.recv()).await.map_err(|_| {
|
||||
@@ -489,29 +475,33 @@ async fn wait_for_input_registrations(
|
||||
);
|
||||
}
|
||||
|
||||
let ready_response = if frames[1].is_empty() {
|
||||
debug!(
|
||||
?actual_id,
|
||||
"received legacy empty input registration from engine"
|
||||
if frames[1].is_empty() {
|
||||
bail_unexpected_handshake_message!(
|
||||
"expected msgpack EngineCoreReadyResponse for engine input registration, got empty payload from engine id {actual_id:?}"
|
||||
);
|
||||
None
|
||||
} else {
|
||||
let ready_response: EngineCoreReadyResponse = decode_msgpack(&frames[1])?;
|
||||
debug!(
|
||||
?actual_id,
|
||||
?ready_response,
|
||||
"received input registration from engine"
|
||||
);
|
||||
Some(ready_response)
|
||||
};
|
||||
|
||||
// Store the ready response in the corresponding engine entry.
|
||||
if let Some(engine) = engines.iter_mut().find(|e| e.engine_id == actual_id) {
|
||||
engine.ready_response = ready_response;
|
||||
}
|
||||
|
||||
let ready_response: EngineCoreReadyResponse = decode_msgpack(&frames[1])?;
|
||||
debug!(
|
||||
?actual_id,
|
||||
?ready_response,
|
||||
"received input registration from engine"
|
||||
);
|
||||
ready_responses.insert(actual_id, ready_response);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(expected_engines
|
||||
.into_iter()
|
||||
.map(|engine_id| {
|
||||
let ready_response = ready_responses
|
||||
.remove(&engine_id)
|
||||
.expect("every expected engine id has a decoded ready response");
|
||||
ConnectedEngine {
|
||||
engine_id,
|
||||
ready_response,
|
||||
}
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Send an encoded message to the engine through the input socket.
|
||||
|
||||
@@ -98,7 +98,8 @@ async fn mock_engine_connects_over_tcp() {
|
||||
let (client, shutdown, task) = connect_with_mock(handshake_address, 1, 1).await;
|
||||
assert_eq!(client.engine_count(), 1);
|
||||
assert_eq!(client.engine_identities()[0], &[0, 0]);
|
||||
assert_eq!(client.max_model_len(), Some(1024 * 1024));
|
||||
assert_eq!(client.max_model_len(), 1024 * 1024);
|
||||
assert_eq!(client.vllm_version(), "test-vllm-version");
|
||||
shutdown_mock(client, shutdown, task).await;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ mod load;
|
||||
mod metrics;
|
||||
pub(crate) mod openai;
|
||||
mod sleep;
|
||||
mod version;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -35,6 +36,7 @@ fn build_router_with_dev_mode(state: Arc<AppState>, dev_mode_enabled: bool) -> R
|
||||
.route("/health", get(health::health))
|
||||
.route("/metrics", get(metrics::scrape))
|
||||
.route("/load", get(load::load))
|
||||
.route("/version", get(version::version))
|
||||
// OpenAI-compatible endpoints
|
||||
.route("/v1/models", get(openai::list_models))
|
||||
.route("/v1/completions", post(openai::completions))
|
||||
|
||||
@@ -994,6 +994,27 @@ async fn list_models_returns_configured_model() {
|
||||
assert_eq!(json["data"][0]["id"], "Qwen/Qwen1.5-0.5B-Chat");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
#[serial]
|
||||
async fn version_returns_engine_vllm_version() {
|
||||
let mut app = test_app().await;
|
||||
let response = app
|
||||
.call(Request::builder().uri("/version").body(Body::empty()).expect("build request"))
|
||||
.await
|
||||
.expect("call app");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
let body = to_bytes(response.into_body(), usize::MAX).await.expect("read body");
|
||||
let json: serde_json::Value = serde_json::from_slice(&body).expect("decode json");
|
||||
assert_eq!(
|
||||
json,
|
||||
json!({
|
||||
"version": "test-vllm-version",
|
||||
"rust_frontend_version": env!("CARGO_PKG_VERSION"),
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
#[serial]
|
||||
async fn http_metrics_record_list_models_requests() {
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::Json;
|
||||
use axum::extract::State;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::state::AppState;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct VersionResponse {
|
||||
version: String,
|
||||
rust_frontend_version: &'static str,
|
||||
}
|
||||
|
||||
/// Get engine and Rust frontend version metadata.
|
||||
pub async fn version(State(state): State<Arc<AppState>>) -> Json<VersionResponse> {
|
||||
let version = state.engine_core_client().vllm_version().to_string();
|
||||
|
||||
Json(VersionResponse {
|
||||
version,
|
||||
rust_frontend_version: env!("CARGO_PKG_VERSION"),
|
||||
})
|
||||
}
|
||||
@@ -45,9 +45,9 @@ pub struct TextLlm {
|
||||
/// Tokenizer/model metadata backend responsible for prompt encode/decode
|
||||
/// and sampling hints.
|
||||
backend: DynTextBackend,
|
||||
/// Context window size derived by the backend or from engine startup
|
||||
/// handshake, with optional override from config.
|
||||
max_model_len: Option<u32>,
|
||||
/// Context window size reported by the engine startup handshake, with
|
||||
/// optional override from config.
|
||||
max_model_len: u32,
|
||||
}
|
||||
|
||||
impl TextLlm {
|
||||
@@ -71,7 +71,7 @@ impl TextLlm {
|
||||
/// This takes priority over both the engine-reported default and any
|
||||
/// tokenizer/model metadata exposed by the backend.
|
||||
pub fn with_max_model_len(mut self, max_model_len: u32) -> Self {
|
||||
self.max_model_len = Some(max_model_len);
|
||||
self.max_model_len = max_model_len;
|
||||
self
|
||||
}
|
||||
|
||||
@@ -129,9 +129,7 @@ impl TextLlm {
|
||||
};
|
||||
|
||||
let mut sampling_hints = self.backend.sampling_hints()?;
|
||||
if let Some(max_model_len) = self.max_model_len {
|
||||
sampling_hints.max_model_len = Some(max_model_len);
|
||||
}
|
||||
sampling_hints.max_model_len = Some(self.max_model_len);
|
||||
let PreparedTextRequest {
|
||||
text_request,
|
||||
generate_request,
|
||||
|
||||
@@ -75,7 +75,8 @@ class EngineCoreReadyResponse:
|
||||
max_model_len: int
|
||||
num_gpu_blocks: int
|
||||
dp_stats_address: str | None
|
||||
dtype: str | None = None
|
||||
dtype: str
|
||||
vllm_version: str
|
||||
|
||||
|
||||
class EngineCoreRequest(
|
||||
|
||||
@@ -1464,6 +1464,7 @@ class EngineCoreProc(EngineCore):
|
||||
num_gpu_blocks=self.vllm_config.cache_config.num_gpu_blocks or 0,
|
||||
dp_stats_address=self.frontend_stats_publish_address,
|
||||
dtype=str(self.vllm_config.model_config.dtype).removeprefix("torch."),
|
||||
vllm_version=VLLM_VERSION,
|
||||
)
|
||||
ready_payload = msgspec.msgpack.encode(ready_response)
|
||||
for input_socket in input_sockets:
|
||||
|
||||
Reference in New Issue
Block a user