mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[model] support FireRedLID (#39290)
Signed-off-by: PatchouliTaisa <patchychen@tencent.com> Co-authored-by: PatchouliTaisa <patchychen@tencent.com>
This commit is contained in:
@@ -537,9 +537,30 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# FireRedLID
|
||||
def run_fireredlid(question: str, audio_count: int) -> ModelRequestData:
    """Build the engine arguments and prompt for the FireRedLID example.

    ``question`` is not read by this function: the prompt is always the
    literal ``"<sos>"`` string. Exactly one audio clip per prompt is
    accepted; any other ``audio_count`` trips the assertion below.
    """
    assert audio_count == 1, "FireRedLID only supports single audio input per prompt"

    return ModelRequestData(
        engine_args=EngineArgs(
            model="PatchyTisa/FireRedLID-vllm",
            max_model_len=8,
            max_num_seqs=5,
            limit_mm_per_prompt={"audio": audio_count},
        ),
        prompt="<sos>",
    )
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"audioflamingo3": run_audioflamingo3,
|
||||
"cohere_asr": run_cohere_asr,
|
||||
"fireredlid": run_fireredlid,
|
||||
"funaudiochat": run_funaudiochat,
|
||||
"gemma3n": run_gemma3n,
|
||||
"glmasr": run_glmasr,
|
||||
|
||||
@@ -55,7 +55,91 @@ def run_whisper():
|
||||
)
|
||||
|
||||
|
||||
def run_fireredasr2():
    """
    FireRedASR2 – Automatic Speech Recognition model.

    Described by the original example as a Conformer encoder + Qwen2 LLM
    decoder used for speech-to-text transcription. Each request is an
    implicit prompt: a chat-formatted string containing the ``<|AUDIO|>``
    placeholder, paired with one audio clip under ``multi_modal_data``.

    Returns the engine arguments together with two such prompts, one per
    bundled sample clip.
    """
    engine_args = EngineArgs(
        model="allendou/FireRedASR2-LLM-vllm",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
    )

    # The same instruction text is reused for every audio sample.
    prompt_str = (
        "<|im_start|>user\n<|AUDIO|>请转写音频为文字<|im_end|>\n<|im_start|>assistant\n"
    )

    # One implicit prompt per sample clip, in this order.
    sample_clips = ("mary_had_lamb", "winning_call")
    prompts = [
        {
            "prompt": prompt_str,
            "multi_modal_data": {
                "audio": AudioAsset(clip).audio_and_sample_rate,
            },
        }
        for clip in sample_clips
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
|
||||
|
||||
|
||||
def run_fireredlid():
    """
    FireRedLID – Language Identification model.

    Described by the original example as an encoder-decoder model that
    identifies the spoken language of an audio clip, emitting at most
    2 tokens (e.g. "en", "zh mandarin").

    Every request is an explicit encoder/decoder prompt: the encoder side
    carries an empty text prompt plus the audio clip, and the decoder side
    is the literal ``"<sos>"`` string. Two requests are built, one per
    bundled sample clip.
    """
    engine_args = EngineArgs(
        model="PatchyTisa/FireRedLID-vllm",
        max_model_len=8,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
    )

    # One explicit encoder/decoder prompt per sample clip, in this order.
    sample_clips = ("mary_had_lamb", "winning_call")
    prompts = [
        {
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset(clip).audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<sos>",
        }
        for clip in sample_clips
    ]

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
|
||||
|
||||
|
||||
# Registry mapping each example name to the function that builds its
# engine arguments and prompts.
model_example_map = dict(
    fireredasr2=run_fireredasr2,
    fireredlid=run_fireredlid,
    whisper=run_whisper,
)
|
||||
|
||||
|
||||
@@ -0,0 +1,193 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Language Identification (LID) demo using the FireRedLID model on vLLM.
|
||||
|
||||
FireRedLID is an audio encoder-decoder model that identifies the spoken
|
||||
language of an audio clip. Unlike ASR models that output full transcriptions,
|
||||
FireRedLID outputs at most 2 tokens representing the detected language
|
||||
(e.g. "en", "zh mandarin").
|
||||
|
||||
Start the vLLM server:
|
||||
|
||||
vllm serve PatchyTisa/FireRedLID-vllm
|
||||
|
||||
Then run this script:
|
||||
|
||||
# Use the built-in sample audio
|
||||
python examples/online_serving/openai_lid_client.py
|
||||
|
||||
# Use your own audio file(s)
|
||||
python examples/online_serving/openai_lid_client.py \
|
||||
--audio_paths audio_en.wav audio_zh.wav audio_fr.wav
|
||||
|
||||
# Batch-identify multiple files in one run
|
||||
python examples/online_serving/openai_lid_client.py \
|
||||
--audio_paths /path/to/dir/*.wav
|
||||
|
||||
Requirements:
|
||||
- vLLM with audio support
|
||||
- openai Python SDK and requests (the streaming call uses raw HTTP)
|
||||
- kaldi_native_fbank (pulled in by the model)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Helpers
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def identify_language(
    audio_path: str,
    client: OpenAI,
    model: str,
) -> str:
    """
    Identify the language of one audio file through the OpenAI SDK client.

    The file is uploaded with ``client.audio.transcriptions.create`` using
    a JSON response format and temperature 0.0. According to the original
    example, FireRedLID re-uses the OpenAI-compatible
    ``/v1/audio/transcriptions`` endpoint, so the returned "transcription"
    text is really the language label (e.g. ``"en"`` or ``"zh mandarin"``).

    Returns the response text with surrounding whitespace removed.
    """
    with open(audio_path, "rb") as audio_file:
        response = client.audio.transcriptions.create(
            file=audio_file,
            model=model,
            response_format="json",
            temperature=0.0,
        )

    label = response.text
    return label.strip()
|
||||
|
||||
|
||||
def identify_language_raw(
    audio_path: str,
    model: str,
    api_base: str,
) -> str:
    """
    Identify the language of one audio file with a plain HTTP request.

    Counterpart of :func:`identify_language` that posts the file to
    ``{api_base}/audio/transcriptions`` via ``requests`` instead of going
    through the OpenAI SDK client (handy for quick debugging).

    Raises whatever ``raise_for_status`` raises on a non-success response.
    Returns the ``"text"`` field of the JSON reply, stripped of surrounding
    whitespace.
    """
    import requests

    endpoint = f"{api_base}/audio/transcriptions"
    form_fields = {
        "model": model,
        "response_format": "json",
    }

    with open(audio_path, "rb") as audio_file:
        upload = {"file": (os.path.basename(audio_path), audio_file)}
        resp = requests.post(endpoint, files=upload, data=form_fields)

    resp.raise_for_status()
    reply = resp.json()
    return reply["text"].strip()
|
||||
|
||||
|
||||
def identify_language_streaming(
    audio_path: str,
    model: str,
    api_base: str,
) -> str:
    """
    Streaming variant – demonstrates the streaming transcription endpoint.

    Posts the audio file to ``{api_base}/audio/transcriptions`` with
    ``stream=true`` and concatenates the ``delta.content`` pieces found in
    the streamed ``data:`` lines. For a 1-2 token output the stream
    finishes almost instantly, but this shows that the API path works
    end-to-end.

    Args:
        audio_path: Path of the audio file to upload.
        model: Model name sent in the request form.
        api_base: Base URL of the API, without a trailing slash.

    Returns:
        The concatenated streamed text, stripped of surrounding whitespace.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        json.JSONDecodeError: If a ``data:`` line is neither ``[DONE]`` nor
            valid JSON.
    """
    import requests

    url = f"{api_base}/audio/transcriptions"
    data = {
        "stream": "true",
        "model": model,
        "response_format": "json",
    }
    data_prefix = b"data:"
    tokens: list[str] = []

    with open(audio_path, "rb") as f:
        files = {"file": (os.path.basename(audio_path), f)}
        response = requests.post(url, files=files, data=data, stream=True)

    # With stream=True the connection is only released once the body is
    # fully consumed or the response is closed. The loop below may stop
    # early, so close the response explicitly via the context manager.
    with response:
        response.raise_for_status()

        for raw_line in response.iter_lines(
            chunk_size=8192, decode_unicode=False, delimiter=b"\n"
        ):
            line = raw_line.strip()
            # Skip blank separators, comment/keep-alive lines and any
            # field other than "data:" instead of feeding them to json.
            if not line.startswith(data_prefix):
                continue

            body = line[len(data_prefix) :].strip()
            # End-of-stream sentinel; it is not JSON.
            if body == b"[DONE]":
                break

            payload = json.loads(body.decode("utf-8"))
            choices = payload.get("choices") or []
            if not choices:
                # A chunk without choices carries no text for us.
                continue

            choice = choices[0]
            delta = (choice.get("delta") or {}).get("content") or ""
            if delta:
                tokens.append(delta)
            if choice.get("finish_reason") is not None:
                break

    return "".join(tokens).strip()
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main(args: argparse.Namespace) -> None:
    """
    Run the language-identification demo and print a small result table.

    Connects to the server at ``args.api_base``, takes the first model the
    server lists, and for each audio file prints its basename next to the
    language returned by the synchronous SDK call and by the streaming
    raw-HTTP call. When ``args.audio_paths`` is empty or ``None``, the two
    bundled vLLM sample clips are used instead.
    """
    api_base = args.api_base.rstrip("/")
    client = OpenAI(api_key="EMPTY", base_url=api_base)
    model = client.models.list().data[0].id
    print(f"Model : {model}")
    print(f"Server: {api_base}\n")

    # Resolve audio paths: user-supplied files win; otherwise fall back to
    # the built-in vLLM sample audios (both are English).
    audio_paths = args.audio_paths or [
        str(AudioAsset(clip).get_local_path())
        for clip in ("mary_had_lamb", "winning_call")
    ]

    # Table header.
    print(f"{'Audio File':<50} {'Language (sync)':<20} {'Language (stream)'}")
    print("-" * 90)

    # One table row per file: sync result first, then the streaming result.
    for path in audio_paths:
        basename = os.path.basename(path)
        lang_sync = identify_language(path, client, model)
        lang_stream = identify_language_streaming(path, model, api_base)
        print(f"{basename:<50} {lang_sync:<20} {lang_stream}")

    print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    parser = argparse.ArgumentParser(
        description="FireRedLID – Language Identification demo via vLLM",
    )

    audio_paths_help = (
        "One or more audio files to identify. "
        "If omitted, uses vLLM's built-in sample audios."
    )
    parser.add_argument(
        "--audio_paths",
        nargs="+",
        default=None,
        help=audio_paths_help,
    )

    api_base_help = "vLLM API base URL (default: http://localhost:8000/v1)"
    parser.add_argument(
        "--api_base",
        type=str,
        default="http://localhost:8000/v1",
        help=api_base_help,
    )

    args = parser.parse_args()
    main(args)
|
||||
Reference in New Issue
Block a user