mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Rust Frontend] Skip loading multimodal processor if --language-model-only is specified (#44500)
Signed-off-by: Bugen Zhao <i@bugenzhao.com>
This commit is contained in:
@@ -38,13 +38,17 @@ impl HfChatBackend {
|
||||
) -> Result<Self> {
|
||||
let model_config = load_model_config(files.config_path.as_deref())?;
|
||||
let model_type = model_config.model_type().unwrap_or_default();
|
||||
let multimodal_model_info = MultimodalModelInfo::from_paths(
|
||||
let multimodal_model_info = if options.language_model_only {
|
||||
None
|
||||
} else {
|
||||
MultimodalModelInfo::from_paths(
|
||||
model_id.clone(),
|
||||
(!model_type.is_empty()).then_some(model_type.to_string()),
|
||||
files.config_path.as_deref(),
|
||||
files.preprocessor_config_path.as_deref(),
|
||||
tokenizer.clone(),
|
||||
)?;
|
||||
)?
|
||||
};
|
||||
let multimodal_render_info = resolve_multimodal_render_info(multimodal_model_info.as_ref());
|
||||
|
||||
let renderer = options.renderer.resolve(model_type);
|
||||
@@ -225,6 +229,7 @@ mod tests {
|
||||
"test-model".to_string(),
|
||||
LoadModelBackendsOptions {
|
||||
renderer,
|
||||
language_model_only: false,
|
||||
chat_template_content_format: Default::default(),
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: HashMap::new(),
|
||||
@@ -267,6 +272,54 @@ mod tests {
|
||||
assert_eq!(prompt, "hello");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn language_model_only_skips_multimodal_preprocessor_config() {
|
||||
let mut files = resolved_files(
|
||||
r#"{"model_type":"deepseek_v0_vl"}"#,
|
||||
r#"{"chat_template":"{{ messages[0].content }}"}"#,
|
||||
);
|
||||
let preprocessor_config_path = files
|
||||
.config_path
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("preprocessor_config.json");
|
||||
write_json(&preprocessor_config_path, r#"{"size":[672,672]}"#);
|
||||
files.preprocessor_config_path = Some(preprocessor_config_path);
|
||||
|
||||
let backend = HfChatBackend::from_resolved_model_files(
|
||||
files.clone(),
|
||||
"test-model".to_string(),
|
||||
LoadModelBackendsOptions {
|
||||
language_model_only: true,
|
||||
chat_template_content_format: Default::default(),
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: HashMap::new(),
|
||||
..Default::default()
|
||||
},
|
||||
test_tokenizer(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
assert!(backend.multimodal_model_info().is_none());
|
||||
|
||||
let error = HfChatBackend::from_resolved_model_files(
|
||||
files,
|
||||
"test-model".to_string(),
|
||||
LoadModelBackendsOptions {
|
||||
chat_template_content_format: Default::default(),
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: HashMap::new(),
|
||||
..Default::default()
|
||||
},
|
||||
test_tokenizer(),
|
||||
)
|
||||
.err()
|
||||
.expect("invalid preprocessor config should fail without language_model_only");
|
||||
assert!(error.to_string().contains("failed to parse preprocessor_config.json"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn explicit_deepseek_renderer_overrides_generic_model_type() {
|
||||
let prompt = render_prompt(
|
||||
|
||||
@@ -60,6 +60,9 @@ pub type DynChatTextBackend = Arc<dyn ChatTextBackend>;
|
||||
pub struct LoadModelBackendsOptions {
|
||||
/// Which chat renderer implementation to use.
|
||||
pub renderer: RendererSelection,
|
||||
/// Disable frontend-side multimodal preprocessing and render the model as
|
||||
/// language-only.
|
||||
pub language_model_only: bool,
|
||||
/// How to serialize `message.content` when rendering the chat template.
|
||||
pub chat_template_content_format: ChatTemplateContentFormatOption,
|
||||
/// Optional server-default chat template override, provided either as an
|
||||
|
||||
@@ -116,6 +116,10 @@ pub struct SharedRuntimeArgs {
|
||||
#[arg(long = "tokenizer-mode", default_value_t)]
|
||||
#[serde(default, rename = "tokenizer_mode")]
|
||||
pub renderer: RendererSelection,
|
||||
/// Disable multimodal inputs and treat the model as language-only.
|
||||
#[arg(long)]
|
||||
#[serde(default)]
|
||||
pub language_model_only: bool,
|
||||
/// Override the maximum model context length. When set, the frontend uses
|
||||
/// this value instead of the model's `max_position_embeddings` from
|
||||
/// `config.json`.
|
||||
@@ -243,6 +247,7 @@ impl SharedRuntimeArgs {
|
||||
tool_call_parser: self.tool_call_parser,
|
||||
reasoning_parser: self.reasoning_parser,
|
||||
renderer: self.renderer,
|
||||
language_model_only: self.language_model_only,
|
||||
chat_template: self.chat_template,
|
||||
default_chat_template_kwargs: self.default_chat_template_kwargs,
|
||||
chat_template_content_format: self.chat_template_content_format,
|
||||
@@ -284,6 +289,7 @@ impl SharedRuntimeArgs {
|
||||
tool_call_parser: self.tool_call_parser,
|
||||
reasoning_parser: self.reasoning_parser,
|
||||
renderer: self.renderer,
|
||||
language_model_only: self.language_model_only,
|
||||
chat_template: self.chat_template,
|
||||
default_chat_template_kwargs: self.default_chat_template_kwargs,
|
||||
chat_template_content_format: self.chat_template_content_format,
|
||||
@@ -419,6 +425,7 @@ impl ServeArgs {
|
||||
self.managed_engine.clone().into_config(
|
||||
self.runtime.model.clone(),
|
||||
self.runtime.max_model_len,
|
||||
self.runtime.language_model_only,
|
||||
handshake_port,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ fn serve_args_forward_python_flags_with_separator() {
|
||||
tool_call_parser: Auto,
|
||||
reasoning_parser: Auto,
|
||||
renderer: Auto,
|
||||
language_model_only: false,
|
||||
max_model_len: Some(
|
||||
512,
|
||||
),
|
||||
@@ -263,6 +264,7 @@ fn frontend_args_accept_json() {
|
||||
tool_call_parser: Auto,
|
||||
reasoning_parser: Auto,
|
||||
renderer: Auto,
|
||||
language_model_only: false,
|
||||
max_model_len: None,
|
||||
grpc_port: None,
|
||||
shutdown_timeout: 0,
|
||||
@@ -321,7 +323,7 @@ fn frontend_args_json_accepts_supported_non_default_fields() {
|
||||
"--output-address",
|
||||
"ipc:///tmp/output.sock",
|
||||
"--args-json",
|
||||
r#"{"model_tag":"Qwen/Qwen3-0.6B","engine_ready_timeout_secs":42,"tool_call_parser":"hermes","reasoning_parser":"qwen3_thinking","tokenizer_mode":"deepseek_v32","max_model_len":8192,"shutdown_timeout":3}"#,
|
||||
r#"{"model_tag":"Qwen/Qwen3-0.6B","engine_ready_timeout_secs":42,"tool_call_parser":"hermes","reasoning_parser":"qwen3_thinking","tokenizer_mode":"deepseek_v32","language_model_only":true,"max_model_len":8192,"shutdown_timeout":3}"#,
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
@@ -338,6 +340,7 @@ fn frontend_args_json_accepts_supported_non_default_fields() {
|
||||
ParserSelection::Explicit("qwen3_thinking".to_string())
|
||||
);
|
||||
assert_eq!(args.runtime.renderer, RendererSelection::DeepSeekV32);
|
||||
assert!(args.runtime.language_model_only);
|
||||
assert_eq!(args.runtime.max_model_len, Some(8192));
|
||||
assert_eq!(args.runtime.shutdown_timeout, 3);
|
||||
}
|
||||
@@ -662,6 +665,7 @@ fn serve_args_accept_handshake_aliases() {
|
||||
tool_call_parser: Auto,
|
||||
reasoning_parser: Auto,
|
||||
renderer: Auto,
|
||||
language_model_only: false,
|
||||
max_model_len: None,
|
||||
grpc_port: None,
|
||||
shutdown_timeout: 0,
|
||||
@@ -783,6 +787,7 @@ fn serve_frontend_config_uses_dp_address_as_advertised_host() {
|
||||
tool_call_parser: Auto,
|
||||
reasoning_parser: Auto,
|
||||
renderer: Auto,
|
||||
language_model_only: false,
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: None,
|
||||
chat_template_content_format: Auto,
|
||||
@@ -846,6 +851,7 @@ fn serve_frontend_config_keeps_tcp_transport_for_non_local_only_topology() {
|
||||
tool_call_parser: Auto,
|
||||
reasoning_parser: Auto,
|
||||
renderer: Auto,
|
||||
language_model_only: false,
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: None,
|
||||
chat_template_content_format: Auto,
|
||||
@@ -924,6 +930,7 @@ fn frontend_config_uses_external_coordinator_when_coordinator_address_is_present
|
||||
tool_call_parser: Auto,
|
||||
reasoning_parser: Auto,
|
||||
renderer: Auto,
|
||||
language_model_only: false,
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: None,
|
||||
chat_template_content_format: Auto,
|
||||
|
||||
@@ -71,6 +71,7 @@ impl ManagedEngineArgs {
|
||||
self,
|
||||
model: String,
|
||||
max_model_len: Option<u32>,
|
||||
language_model_only: bool,
|
||||
handshake_port: u16,
|
||||
) -> ManagedEngineConfig {
|
||||
let mut python_args = self.python_args;
|
||||
@@ -79,6 +80,9 @@ impl ManagedEngineArgs {
|
||||
python_args.push("--max-model-len".to_string());
|
||||
python_args.push(max_model_len.to_string());
|
||||
}
|
||||
if language_model_only {
|
||||
python_args.push("--language-model-only".to_string());
|
||||
}
|
||||
if let Some(data_parallel_size_local) = self.data_parallel_size_local {
|
||||
python_args.push("--data-parallel-size-local".to_string());
|
||||
python_args.push(data_parallel_size_local.to_string());
|
||||
|
||||
@@ -64,6 +64,7 @@ async fn main() -> Result<()> {
|
||||
tool_call_parser: ParserSelection::Auto,
|
||||
reasoning_parser: ParserSelection::Auto,
|
||||
renderer: RendererSelection::Auto,
|
||||
language_model_only: false,
|
||||
chat_template: None,
|
||||
default_chat_template_kwargs: None,
|
||||
chat_template_content_format: ChatTemplateContentFormatOption::Auto,
|
||||
|
||||
@@ -53,6 +53,9 @@ pub struct Config {
|
||||
pub reasoning_parser: ParserSelection,
|
||||
/// Chat renderer selection.
|
||||
pub renderer: RendererSelection,
|
||||
/// Disable frontend-side multimodal preprocessing and render the model as
|
||||
/// language-only.
|
||||
pub language_model_only: bool,
|
||||
/// Server-default chat template override, as a file path or inline
|
||||
/// template.
|
||||
pub chat_template: Option<String>,
|
||||
|
||||
@@ -42,6 +42,7 @@ async fn build_state(config: &Config) -> Result<Arc<AppState>> {
|
||||
&config.model,
|
||||
LoadModelBackendsOptions {
|
||||
renderer: config.renderer,
|
||||
language_model_only: config.language_model_only,
|
||||
chat_template: config.chat_template.clone(),
|
||||
chat_template_content_format: config.chat_template_content_format,
|
||||
default_chat_template_kwargs: config
|
||||
|
||||
Reference in New Issue
Block a user