[model] support FireRedLID (#39290)

Signed-off-by: PatchouliTaisa <patchychen@tencent.com> Co-authored-by: PatchouliTaisa <patchychen@tencent.com>
2026-06-06 00:16:14 +00:00 · 2026-04-10 16:43:58 +08:00
parent 8e8a3becd1
commit 967146e7bd
14 changed files with 1803 additions and 347 deletions
@@ -537,9 +537,30 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
    )


+# FireRedLID
+def run_fireredlid(question: str, audio_count: int) -> ModelRequestData:
+    """
+    Build the request data for the FireRedLID language-identification example.
+
+    ``question`` is part of the ``run_*`` signature but is not used here:
+    the prompt is always the ``<sos>`` token. ``audio_count`` must be
+    exactly 1 and is forwarded as the per-prompt audio limit.
+    """
+    assert audio_count == 1, "FireRedLID only supports single audio input per prompt"
+
+    return ModelRequestData(
+        engine_args=EngineArgs(
+            model="PatchyTisa/FireRedLID-vllm",
+            max_model_len=8,
+            max_num_seqs=5,
+            limit_mm_per_prompt={"audio": audio_count},
+        ),
+        prompt="<sos>",
+    )
+
+
 model_example_map = {
    "audioflamingo3": run_audioflamingo3,
    "cohere_asr": run_cohere_asr,
+    "fireredlid": run_fireredlid,
    "funaudiochat": run_funaudiochat,
    "gemma3n": run_gemma3n,
    "glmasr": run_glmasr,
@@ -55,7 +55,91 @@ def run_whisper():
    )


+def run_fireredasr2():
+    """
+    FireRedASR2 – Automatic Speech Recognition model.
+
+    The model pairs a Conformer encoder with a Qwen2 LLM decoder to turn
+    speech into text. Each request is an implicit prompt: a chat-formatted
+    string holding the ``<|AUDIO|>`` placeholder plus the audio itself
+    under ``multi_modal_data``.
+
+    Returns a ``ModelRequestData`` with the engine arguments and one prompt
+    per bundled sample clip.
+    """
+    engine_args = EngineArgs(
+        model="allendou/FireRedASR2-LLM-vllm",
+        max_model_len=448,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"audio": 1},
+    )
+
+    # One user turn (audio placeholder + instruction) followed by the
+    # opening of the assistant turn.
+    chat_prompt = (
+        "<|im_start|>user\n"
+        "<|AUDIO|>请转写音频为文字<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+
+    # The same prompt text is reused for every sample clip.
+    sample_names = ("mary_had_lamb", "winning_call")
+    prompts = [
+        {
+            "prompt": chat_prompt,
+            "multi_modal_data": {
+                "audio": AudioAsset(name).audio_and_sample_rate,
+            },
+        }
+        for name in sample_names
+    ]
+
+    return ModelRequestData(engine_args=engine_args, prompts=prompts)
+
+
+def run_fireredlid():
+    """
+    FireRedLID – Language Identification model.
+
+    An encoder-decoder model that reports the spoken language of an audio
+    clip, emitting at most 2 tokens (e.g. "en", "zh mandarin").
+
+    Every request uses the explicit encoder/decoder prompt form: the
+    encoder side carries an empty text prompt plus the audio, and the
+    decoder side is seeded with ``<sos>``.
+    """
+    engine_args = EngineArgs(
+        model="PatchyTisa/FireRedLID-vllm",
+        max_model_len=8,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"audio": 1},
+    )
+
+    # One explicit encoder/decoder prompt per bundled sample clip.
+    sample_names = ("mary_had_lamb", "winning_call")
+    prompts = [
+        {
+            "encoder_prompt": {
+                "prompt": "",
+                "multi_modal_data": {
+                    "audio": AudioAsset(name).audio_and_sample_rate,
+                },
+            },
+            "decoder_prompt": "<sos>",
+        }
+        for name in sample_names
+    ]
+
+    return ModelRequestData(engine_args=engine_args, prompts=prompts)
+
+
 model_example_map = {
+    # Model name -> function that builds that model's request data.
+    "fireredasr2": run_fireredasr2,
+    "fireredlid": run_fireredlid,
     "whisper": run_whisper,
 }

@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Language Identification (LID) demo using the FireRedLID model on vLLM.
+
+FireRedLID is an audio encoder-decoder model that identifies the spoken
+language of an audio clip. Unlike ASR models that output full transcriptions,
+FireRedLID outputs at most 2 tokens representing the detected language
+(e.g. "en", "zh mandarin").
+
+Start the vLLM server:
+
+    vllm serve PatchyTisa/FireRedLID-vllm
+
+Then run this script:
+
+    # Use the built-in sample audio
+    python examples/online_serving/openai_lid_client.py
+
+    # Use your own audio file(s)
+    python examples/online_serving/openai_lid_client.py \
+        --audio_paths audio_en.wav audio_zh.wav audio_fr.wav
+
+    # Batch-identify multiple files in one run
+    python examples/online_serving/openai_lid_client.py \
+        --audio_paths /path/to/dir/*.wav
+
+Requirements:
+- vLLM with audio support
+- openai Python SDK
+- requests (used by the raw-HTTP and streaming helpers)
+- kaldi_native_fbank (pulled in by the model)
+"""
+
+import argparse
+import json
+import os
+
+from openai import OpenAI
+
+from vllm.assets.audio import AudioAsset
+
+# ──────────────────────────────────────────────────────────────────────
+# Helpers
+# ──────────────────────────────────────────────────────────────────────
+
+
+def identify_language(
+    audio_path: str,
+    client: OpenAI,
+    model: str,
+) -> str:
+    """
+    Detect the spoken language of one audio file through the OpenAI SDK.
+
+    The file is uploaded to the OpenAI-compatible
+    ``/v1/audio/transcriptions`` endpoint, which FireRedLID re-uses: the
+    "transcription" text that comes back is the language label
+    (e.g. ``"en"`` or ``"zh mandarin"``). The label is returned with
+    surrounding whitespace removed.
+    """
+    request_options = {
+        "model": model,
+        "response_format": "json",
+        "temperature": 0.0,
+    }
+    with open(audio_path, "rb") as audio_file:
+        transcription = client.audio.transcriptions.create(
+            file=audio_file,
+            **request_options,
+        )
+
+    label = transcription.text
+    return label.strip()
+
+
+def identify_language_raw(
+    audio_path: str,
+    model: str,
+    api_base: str,
+    timeout: float = 60.0,
+) -> str:
+    """
+    Same as :func:`identify_language` but posts the file with plain
+    ``requests`` instead of going through an ``OpenAI`` client object
+    (useful for quick debugging of the HTTP endpoint itself).
+
+    ``api_base`` is the API root that ``/audio/transcriptions`` is appended
+    to. ``timeout`` is the number of seconds ``requests`` waits for the
+    server before giving up; without it a stalled server would block the
+    call indefinitely.
+
+    Returns the ``"text"`` field of the JSON response, stripped of
+    surrounding whitespace. Raises ``requests.HTTPError`` for a non-2xx
+    response.
+    """
+    import requests
+
+    url = f"{api_base}/audio/transcriptions"
+    with open(audio_path, "rb") as f:
+        files = {"file": (os.path.basename(audio_path), f)}
+        data = {
+            "model": model,
+            "response_format": "json",
+        }
+        resp = requests.post(url, files=files, data=data, timeout=timeout)
+        resp.raise_for_status()
+    return resp.json()["text"].strip()
+
+
+def identify_language_streaming(
+    audio_path: str,
+    model: str,
+    api_base: str,
+    timeout: float = 60.0,
+) -> str:
+    """
+    Streaming variant – demonstrates the streaming transcription endpoint.
+    For a 1-2 token output the stream finishes almost instantly, but this
+    shows that the API path works end-to-end.
+
+    The response is read line by line as server-sent events: only
+    ``data:`` lines are parsed, an OpenAI-style ``[DONE]`` sentinel ends
+    the stream, and events that carry no choice are skipped. ``timeout``
+    (seconds) is handed to ``requests`` so that a stalled server cannot
+    block the demo forever.
+
+    Returns the concatenated ``delta.content`` pieces, stripped of
+    surrounding whitespace. Raises ``requests.HTTPError`` for a non-2xx
+    response.
+    """
+    import requests
+
+    url = f"{api_base}/audio/transcriptions"
+    data = {
+        "stream": "true",
+        "model": model,
+        "response_format": "json",
+    }
+    sse_prefix = b"data:"
+    tokens: list[str] = []
+
+    with open(audio_path, "rb") as f:
+        files = {"file": (os.path.basename(audio_path), f)}
+        # Using the response as a context manager releases the connection
+        # even though the loop stops before the body is fully consumed.
+        with requests.post(
+            url, files=files, data=data, stream=True, timeout=timeout
+        ) as response:
+            response.raise_for_status()
+
+            for line in response.iter_lines(
+                chunk_size=8192, decode_unicode=False, delimiter=b"\n"
+            ):
+                # Skip blank separator lines and any non-data SSE field.
+                if not line.startswith(sse_prefix):
+                    continue
+                body = line[len(sse_prefix) :].strip()
+                if body == b"[DONE]":
+                    break
+                if not body:
+                    continue
+
+                payload = json.loads(body.decode("utf-8"))
+                choices = payload.get("choices") or []
+                if not choices:
+                    # Nothing to read from an event without choices.
+                    continue
+                choice = choices[0]
+                delta = (choice.get("delta") or {}).get("content", "")
+                if delta:
+                    tokens.append(delta)
+                if choice.get("finish_reason") is not None:
+                    break
+
+    return "".join(tokens).strip()
+
+
+# ──────────────────────────────────────────────────────────────────────
+# Main
+# ──────────────────────────────────────────────────────────────────────
+
+
+def main(args: argparse.Namespace) -> None:
+    """
+    Run language identification on each audio file and print a result table.
+
+    Reads ``args.api_base`` and ``args.audio_paths``. The model queried is
+    the first one the server lists. Every file is identified twice, once
+    through the OpenAI SDK and once through the streaming helper, and both
+    labels are printed side by side.
+    """
+    server_url = args.api_base.rstrip("/")
+    client = OpenAI(api_key="EMPTY", base_url=server_url)
+    model = client.models.list().data[0].id
+    print(f"Model : {model}")
+    print(f"Server: {server_url}\n")
+
+    # With no paths on the command line, fall back to the built-in vLLM
+    # sample audios (both are English).
+    audio_paths = args.audio_paths or [
+        str(AudioAsset(name).get_local_path())
+        for name in ("mary_had_lamb", "winning_call")
+    ]
+
+    # Table header, then one row per file.
+    header = f"{'Audio File':<50} {'Language (sync)':<20} {'Language (stream)'}"
+    print(header)
+    print("-" * 90)
+
+    for audio_path in audio_paths:
+        file_name = os.path.basename(audio_path)
+        # Same file, two routes: SDK call first, raw-HTTP streaming second.
+        sync_label = identify_language(audio_path, client, model)
+        stream_label = identify_language_streaming(audio_path, model, server_url)
+        print(f"{file_name:<50} {sync_label:<20} {stream_label}")
+
+    print()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the two options and hand them to main().
+    parser = argparse.ArgumentParser(
+        description="FireRedLID – Language Identification demo via vLLM",
+    )
+
+    # Optional list of files; left as None when the flag is absent.
+    audio_paths_help = (
+        "One or more audio files to identify. "
+        "If omitted, uses vLLM's built-in sample audios."
+    )
+    parser.add_argument(
+        "--audio_paths", nargs="+", default=None, help=audio_paths_help
+    )
+
+    # Root URL of the OpenAI-compatible API.
+    parser.add_argument(
+        "--api_base",
+        type=str,
+        default="http://localhost:8000/v1",
+        help="vLLM API base URL (default: http://localhost:8000/v1)",
+    )
+
+    args = parser.parse_args()
+    main(args)