[Rust Frontend] Skip loading multimodal processor if --language-model-only is specified (#44500)

Signed-off-by: Bugen Zhao <i@bugenzhao.com>
This commit is contained in:
Bugen Zhao
2026-06-05 08:02:54 +08:00
committed by GitHub
parent b7c5baf63d
commit 62d6f06e3d
8 changed files with 87 additions and 8 deletions
+55 -2
View File
@@ -38,13 +38,17 @@ impl HfChatBackend {
) -> Result<Self> {
let model_config = load_model_config(files.config_path.as_deref())?;
let model_type = model_config.model_type().unwrap_or_default();
let multimodal_model_info = MultimodalModelInfo::from_paths(
let multimodal_model_info = if options.language_model_only {
None
} else {
MultimodalModelInfo::from_paths(
model_id.clone(),
(!model_type.is_empty()).then_some(model_type.to_string()),
files.config_path.as_deref(),
files.preprocessor_config_path.as_deref(),
tokenizer.clone(),
)?;
)?
};
let multimodal_render_info = resolve_multimodal_render_info(multimodal_model_info.as_ref());
let renderer = options.renderer.resolve(model_type);
@@ -225,6 +229,7 @@ mod tests {
"test-model".to_string(),
LoadModelBackendsOptions {
renderer,
language_model_only: false,
chat_template_content_format: Default::default(),
chat_template: None,
default_chat_template_kwargs: HashMap::new(),
@@ -267,6 +272,54 @@ mod tests {
assert_eq!(prompt, "hello");
}
/// With `language_model_only` set, building the backend must succeed and
/// expose no multimodal model info even though a preprocessor config that
/// fails to parse is present; with default options the same files must be
/// rejected with a preprocessor-config parse error.
#[test]
fn language_model_only_skips_multimodal_preprocessor_config() {
    let mut model_files = resolved_files(
        r#"{"model_type":"deepseek_v0_vl"}"#,
        r#"{"chat_template":"{{ messages[0].content }}"}"#,
    );

    // Place the preprocessor config in the same directory as the model config
    // and register it on the resolved file set.
    let model_dir = model_files.config_path.as_ref().unwrap().parent().unwrap();
    let preprocessor_path = model_dir.join("preprocessor_config.json");
    write_json(&preprocessor_path, r#"{"size":[672,672]}"#);
    model_files.preprocessor_config_path = Some(preprocessor_path);

    // Language-only: construction succeeds and no multimodal info is exposed.
    let text_only_options = LoadModelBackendsOptions {
        language_model_only: true,
        chat_template_content_format: Default::default(),
        chat_template: None,
        default_chat_template_kwargs: HashMap::new(),
        ..Default::default()
    };
    let text_only_backend = HfChatBackend::from_resolved_model_files(
        model_files.clone(),
        "test-model".to_string(),
        text_only_options,
        test_tokenizer(),
    )
    .unwrap();
    assert!(text_only_backend.multimodal_model_info().is_none());

    // Default options (`language_model_only` left to `Default`): the same
    // files must be rejected.
    let default_options = LoadModelBackendsOptions {
        chat_template_content_format: Default::default(),
        chat_template: None,
        default_chat_template_kwargs: HashMap::new(),
        ..Default::default()
    };
    let multimodal_result = HfChatBackend::from_resolved_model_files(
        model_files,
        "test-model".to_string(),
        default_options,
        test_tokenizer(),
    );
    let error = match multimodal_result {
        Ok(_) => panic!("invalid preprocessor config should fail without language_model_only"),
        Err(error) => error,
    };
    let message = error.to_string();
    assert!(message.contains("failed to parse preprocessor_config.json"));
}
#[test]
fn explicit_deepseek_renderer_overrides_generic_model_type() {
let prompt = render_prompt(
+3
View File
@@ -60,6 +60,9 @@ pub type DynChatTextBackend = Arc<dyn ChatTextBackend>;
pub struct LoadModelBackendsOptions {
/// Which chat renderer implementation to use.
pub renderer: RendererSelection,
/// Disable frontend-side multimodal preprocessing and render the model as
/// language-only.
pub language_model_only: bool,
/// How to serialize `message.content` when rendering the chat template.
pub chat_template_content_format: ChatTemplateContentFormatOption,
/// Optional server-default chat template override, provided either as an
+7
View File
@@ -116,6 +116,10 @@ pub struct SharedRuntimeArgs {
#[arg(long = "tokenizer-mode", default_value_t)]
#[serde(default, rename = "tokenizer_mode")]
pub renderer: RendererSelection,
/// Disable multimodal inputs and treat the model as language-only.
#[arg(long)]
#[serde(default)]
pub language_model_only: bool,
/// Override the maximum model context length. When set, the frontend uses
/// this value instead of the model's `max_position_embeddings` from
/// `config.json`.
@@ -243,6 +247,7 @@ impl SharedRuntimeArgs {
tool_call_parser: self.tool_call_parser,
reasoning_parser: self.reasoning_parser,
renderer: self.renderer,
language_model_only: self.language_model_only,
chat_template: self.chat_template,
default_chat_template_kwargs: self.default_chat_template_kwargs,
chat_template_content_format: self.chat_template_content_format,
@@ -284,6 +289,7 @@ impl SharedRuntimeArgs {
tool_call_parser: self.tool_call_parser,
reasoning_parser: self.reasoning_parser,
renderer: self.renderer,
language_model_only: self.language_model_only,
chat_template: self.chat_template,
default_chat_template_kwargs: self.default_chat_template_kwargs,
chat_template_content_format: self.chat_template_content_format,
@@ -419,6 +425,7 @@ impl ServeArgs {
self.managed_engine.clone().into_config(
self.runtime.model.clone(),
self.runtime.max_model_len,
self.runtime.language_model_only,
handshake_port,
)
}
+8 -1
View File
@@ -34,6 +34,7 @@ fn serve_args_forward_python_flags_with_separator() {
tool_call_parser: Auto,
reasoning_parser: Auto,
renderer: Auto,
language_model_only: false,
max_model_len: Some(
512,
),
@@ -263,6 +264,7 @@ fn frontend_args_accept_json() {
tool_call_parser: Auto,
reasoning_parser: Auto,
renderer: Auto,
language_model_only: false,
max_model_len: None,
grpc_port: None,
shutdown_timeout: 0,
@@ -321,7 +323,7 @@ fn frontend_args_json_accepts_supported_non_default_fields() {
"--output-address",
"ipc:///tmp/output.sock",
"--args-json",
r#"{"model_tag":"Qwen/Qwen3-0.6B","engine_ready_timeout_secs":42,"tool_call_parser":"hermes","reasoning_parser":"qwen3_thinking","tokenizer_mode":"deepseek_v32","max_model_len":8192,"shutdown_timeout":3}"#,
r#"{"model_tag":"Qwen/Qwen3-0.6B","engine_ready_timeout_secs":42,"tool_call_parser":"hermes","reasoning_parser":"qwen3_thinking","tokenizer_mode":"deepseek_v32","language_model_only":true,"max_model_len":8192,"shutdown_timeout":3}"#,
])
.unwrap();
@@ -338,6 +340,7 @@ fn frontend_args_json_accepts_supported_non_default_fields() {
ParserSelection::Explicit("qwen3_thinking".to_string())
);
assert_eq!(args.runtime.renderer, RendererSelection::DeepSeekV32);
assert!(args.runtime.language_model_only);
assert_eq!(args.runtime.max_model_len, Some(8192));
assert_eq!(args.runtime.shutdown_timeout, 3);
}
@@ -662,6 +665,7 @@ fn serve_args_accept_handshake_aliases() {
tool_call_parser: Auto,
reasoning_parser: Auto,
renderer: Auto,
language_model_only: false,
max_model_len: None,
grpc_port: None,
shutdown_timeout: 0,
@@ -783,6 +787,7 @@ fn serve_frontend_config_uses_dp_address_as_advertised_host() {
tool_call_parser: Auto,
reasoning_parser: Auto,
renderer: Auto,
language_model_only: false,
chat_template: None,
default_chat_template_kwargs: None,
chat_template_content_format: Auto,
@@ -846,6 +851,7 @@ fn serve_frontend_config_keeps_tcp_transport_for_non_local_only_topology() {
tool_call_parser: Auto,
reasoning_parser: Auto,
renderer: Auto,
language_model_only: false,
chat_template: None,
default_chat_template_kwargs: None,
chat_template_content_format: Auto,
@@ -924,6 +930,7 @@ fn frontend_config_uses_external_coordinator_when_coordinator_address_is_present
tool_call_parser: Auto,
reasoning_parser: Auto,
renderer: Auto,
language_model_only: false,
chat_template: None,
default_chat_template_kwargs: None,
chat_template_content_format: Auto,
+4
View File
@@ -71,6 +71,7 @@ impl ManagedEngineArgs {
self,
model: String,
max_model_len: Option<u32>,
language_model_only: bool,
handshake_port: u16,
) -> ManagedEngineConfig {
let mut python_args = self.python_args;
@@ -79,6 +80,9 @@ impl ManagedEngineArgs {
python_args.push("--max-model-len".to_string());
python_args.push(max_model_len.to_string());
}
if language_model_only {
python_args.push("--language-model-only".to_string());
}
if let Some(data_parallel_size_local) = self.data_parallel_size_local {
python_args.push("--data-parallel-size-local".to_string());
python_args.push(data_parallel_size_local.to_string());
@@ -64,6 +64,7 @@ async fn main() -> Result<()> {
tool_call_parser: ParserSelection::Auto,
reasoning_parser: ParserSelection::Auto,
renderer: RendererSelection::Auto,
language_model_only: false,
chat_template: None,
default_chat_template_kwargs: None,
chat_template_content_format: ChatTemplateContentFormatOption::Auto,
+3
View File
@@ -53,6 +53,9 @@ pub struct Config {
pub reasoning_parser: ParserSelection,
/// Chat renderer selection.
pub renderer: RendererSelection,
/// Disable frontend-side multimodal preprocessing and render the model as
/// language-only.
pub language_model_only: bool,
/// Server-default chat template override, as a file path or inline
/// template.
pub chat_template: Option<String>,
+1
View File
@@ -42,6 +42,7 @@ async fn build_state(config: &Config) -> Result<Arc<AppState>> {
&config.model,
LoadModelBackendsOptions {
renderer: config.renderer,
language_model_only: config.language_model_only,
chat_template: config.chat_template.clone(),
chat_template_content_format: config.chat_template_content_format,
default_chat_template_kwargs: config