Update to transformers v5 (#30566)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: khluu <khluu000@gmail.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: khluu <khluu000@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
Harry Mellor
2026-04-16 00:29:15 +01:00
committed by GitHub
parent 6dc9491406
commit 03f8d3a548
41 changed files with 445 additions and 115 deletions
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
+15 -1
View File
@@ -4,7 +4,6 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
device: h200_18gb
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -73,3 +72,18 @@ steps:
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Transformers Backward Compatibility Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install transformers==4.57.5
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+5 -4
View File
@@ -642,7 +642,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope \
uv pip install --system accelerate modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
# ============================================================
@@ -756,9 +756,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
+6
View File
@@ -197,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
######################### RELEASE IMAGE #########################
FROM base AS vllm-openai
+4 -3
View File
@@ -272,9 +272,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/test/nightly-torch.txt
+4 -3
View File
@@ -365,9 +365,10 @@ RUN cd /vllm-workspace \
&& python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV HF_XET_HIGH_PERFORMANCE=1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# install audio decode package `torchcodec` from source (required due to
# ROCm and torch version mismatch) for tests with datasets package
@@ -240,7 +240,7 @@ uv pip install vllm==${VLLM_VERSION} \
# Install dependencies
pip install --upgrade numba \
scipy \
huggingface-hub[cli,hf_transfer] \
huggingface-hub[cli] \
setuptools_scm
pip install -r requirements/rocm.txt
+2 -2
View File
@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.56.0, < 5
transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.14.0.1 # required for compressed-tensors
compressed-tensors == 0.15.0.1 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
+3 -3
View File
@@ -18,7 +18,7 @@ httpx
librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
peft>=0.18.1 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
transformers==5.5.3
tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes==0.49.2
+10 -10
View File
@@ -4,7 +4,7 @@ absl-py==2.1.0
# via
# rouge-score
# tensorboard
accelerate==1.0.1
accelerate==1.13.0
# via peft
aenum==3.1.16
# via lightly
@@ -248,7 +248,6 @@ filelock==3.16.1
# huggingface-hub
# ray
# torch
# transformers
# virtualenv
fiona==1.10.1
# via torchgeo
@@ -331,7 +330,7 @@ h5py==3.13.0
# via terratorch
harfile==0.3.0
# via schemathesis
hf-xet==1.1.7
hf-xet==1.4.3
# via huggingface-hub
hiredis==3.0.0
# via tensorizer
@@ -345,9 +344,10 @@ httpx==0.27.2
# via
# -r requirements/test/cuda.in
# diffusers
# huggingface-hub
# perceptron
# schemathesis
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# accelerate
# datasets
@@ -756,7 +756,7 @@ pathvalidate==3.2.1
# via pytablewriter
patsy==1.0.1
# via statsmodels
peft==0.16.0
peft==0.18.1
# via -r requirements/test/cuda.in
perceptron==0.1.4
# via -r requirements/test/cuda.in
@@ -982,7 +982,7 @@ referencing==0.35.1
# via
# jsonschema
# jsonschema-specifications
regex==2024.9.11
regex==2026.2.28
# via
# diffusers
# nltk
@@ -1002,7 +1002,6 @@ requests==2.32.3
# google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub
# lightly
# lm-eval
# mistral-common
@@ -1015,7 +1014,6 @@ requests==2.32.3
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test/cuda.in
@@ -1216,7 +1214,7 @@ timm==1.0.17
# segmentation-models-pytorch
# terratorch
# torchgeo
tokenizers==0.22.0
tokenizers==0.22.2
# via
# -c requirements/common.txt
# -r requirements/test/cuda.in
@@ -1295,7 +1293,7 @@ tqdm==4.67.3
# tacoreader
# terratorch
# transformers
transformers==4.57.5
transformers==5.5.3
# via
# -c requirements/common.txt
# -r requirements/test/cuda.in
@@ -1317,7 +1315,9 @@ typepy==1.3.2
typer==0.15.2
# via
# fastsafetensors
# huggingface-hub
# perceptron
# transformers
types-python-dateutil==2.9.0.20241206
# via arrow
typeshed-client==2.8.2
+2 -2
View File
@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
transformers==5.5.3
tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.49.2
+2 -3
View File
@@ -38,8 +38,8 @@ opencv-python-headless>=4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
transformers==5.5.3
tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test
# quantization
bitsandbytes==0.49.2
@@ -82,4 +82,3 @@ plotly # required for perf comparison html report
rapidfuzz
torchgeo==0.7.0
multiprocess==0.70.16
huggingface-hub==0.36.2
+15 -16
View File
@@ -39,7 +39,7 @@ annotated-doc==0.0.4
# typer
annotated-types==0.7.0
# via pydantic
anthropic==0.89.0
anthropic==0.93.0
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
@@ -172,7 +172,7 @@ colorful==0.5.8
# via ray
colorlog==6.10.1
# via optuna
compressed-tensors==0.14.0.1
compressed-tensors==0.15.0.1
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
@@ -269,9 +269,9 @@ fastapi==0.135.2
# model-hosting-container-standards
fastapi-cli==0.0.24
# via fastapi
fastapi-cloud-cli==0.15.1
fastapi-cloud-cli==0.16.1
# via fastapi-cli
fastar==0.9.0
fastar==0.10.0
# via fastapi-cloud-cli
fastparquet==2026.3.0
# via genai-perf
@@ -290,7 +290,6 @@ filelock==3.25.2
# python-discovery
# ray
# torch
# transformers
# virtualenv
fiona==1.10.1
# via torchgeo
@@ -384,7 +383,7 @@ h5py==3.16.0
# via terratorch
harfile==0.4.0
# via schemathesis
hf-xet==1.4.2
hf-xet==1.4.3
# via huggingface-hub
hiredis==3.3.1
# via tensorizer
@@ -403,6 +402,7 @@ httpx==0.27.2
# diffusers
# fastapi
# fastapi-cloud-cli
# huggingface-hub
# mcp
# model-hosting-container-standards
# openai
@@ -410,9 +410,8 @@ httpx==0.27.2
# schemathesis
httpx-sse==0.4.3
# via mcp
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# -r requirements/test/rocm.in
# accelerate
# datasets
# diffusers
@@ -484,7 +483,7 @@ jinja2==3.1.6
# genai-perf
# lm-eval
# torch
jiter==0.13.0
jiter==0.14.0
# via
# anthropic
# openai
@@ -631,7 +630,7 @@ msgpack==1.1.2
# via
# librosa
# ray
msgspec==0.20.0
msgspec==0.21.0
# via -r requirements/test/../common.txt
mteb==2.11.5
# via -r requirements/test/rocm.in
@@ -742,7 +741,7 @@ omegaconf==2.3.0
# lightning
open-clip-torch==2.32.0
# via -r requirements/test/rocm.in
openai==2.30.0
openai==2.31.0
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
@@ -1093,7 +1092,7 @@ python-dotenv==1.2.2
# uvicorn
python-json-logger==4.1.0
# via -r requirements/test/../common.txt
python-multipart==0.0.22
python-multipart==0.0.26
# via
# fastapi
# mcp
@@ -1180,7 +1179,6 @@ requests==2.32.5
# google-api-core
# google-cloud-storage
# gpt-oss
# huggingface-hub
# lightly
# lm-eval
# mistral-common
@@ -1194,7 +1192,6 @@ requests==2.32.5
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test/rocm.in
@@ -1428,7 +1425,7 @@ timm==1.0.17
# segmentation-models-pytorch
# terratorch
# torchgeo
tokenizers==0.22.0
tokenizers==0.22.2
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
@@ -1471,7 +1468,7 @@ tqdm==4.67.3
# tacoreader
# terratorch
# transformers
transformers==4.57.5
transformers==5.5.3
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
@@ -1498,7 +1495,9 @@ typer==0.24.1
# fastapi-cli
# fastapi-cloud-cli
# fastsafetensors
# huggingface-hub
# perceptron
# transformers
typeshed-client==2.9.0
# via jsonargparse
typing-extensions==4.15.0
-1
View File
@@ -13,7 +13,6 @@ pytest-shard
absl-py
accelerate
arctic-inference
hf_transfer
lm_eval[api]
modelscope
+15 -9
View File
@@ -19,7 +19,9 @@ aiosignal==1.4.0
albumentations==1.4.6
# via -r requirements/test/xpu.in
annotated-doc==0.0.4
# via fastapi
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
anyio==4.13.0
@@ -64,6 +66,7 @@ click==8.3.1
# jiwer
# nltk
# schemathesis
# typer
# uvicorn
colorama==0.4.6
# via sacrebleu
@@ -112,7 +115,6 @@ filelock==3.25.2
# huggingface-hub
# modelscope
# torch
# transformers
frozenlist==1.8.0
# via
# aiohttp
@@ -133,9 +135,7 @@ h11==0.16.0
# uvicorn
harfile==0.4.0
# via schemathesis
hf-transfer==0.1.9
# via -r requirements/test/xpu.in
hf-xet==1.4.2
hf-xet==1.4.3
# via huggingface-hub
html2text==2025.4.15
# via gpt-oss
@@ -144,8 +144,9 @@ httpcore==1.0.9
httpx==0.28.1
# via
# datasets
# huggingface-hub
# schemathesis
huggingface-hub==0.36.2
huggingface-hub==1.10.2
# via
# accelerate
# datasets
@@ -515,7 +516,6 @@ requests==2.33.1
# docker
# evaluate
# gpt-oss
# huggingface-hub
# lm-eval
# mistral-common
# modelscope
@@ -524,11 +524,11 @@ requests==2.33.1
# schemathesis
# starlette-testclient
# tiktoken
# transformers
rich==14.3.3
# via
# mteb
# schemathesis
# typer
rouge-score==0.1.2
# via lm-eval
rpds-py==0.30.0
@@ -572,6 +572,8 @@ setuptools==80.10.2
# modelscope
# pytablewriter
# torch
shellingham==1.5.4
# via typer
six==1.17.0
# via
# -c requirements/common.txt
@@ -665,7 +667,7 @@ tqdm==4.67.3
# pqdm
# sentence-transformers
# transformers
transformers==4.57.6
transformers==5.5.3
# via
# -c requirements/common.txt
# sentence-transformers
@@ -676,6 +678,10 @@ typepy==1.3.4
# dataproperty
# pytablewriter
# tabledata
typer==0.24.1
# via
# huggingface-hub
# transformers
typing-extensions==4.15.0
# via
# -c requirements/common.txt
+9
View File
@@ -410,6 +410,15 @@ class HfRunner:
model_name,
trust_remote_code=trust_remote_code,
)
# HF runner should use the HF config so that it's consistent with the HF model
if self.config.__module__.startswith("vllm.transformers_utils.configs"):
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
del CONFIG_MAPPING._extra_content[self.config.model_type]
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=trust_remote_code,
)
self.device = self.get_default_device()
self.dtype = dtype = _get_and_verify_dtype(
self.model_name,
+6
View File
@@ -3,6 +3,7 @@
import tempfile
from collections import OrderedDict
from importlib import reload
from unittest.mock import MagicMock
import pytest
@@ -47,6 +48,11 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
if current_platform.is_cuda():
monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
import vllm.lora.layers.base_linear
if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
# Reload the module to ensure the environment variable takes effect.
reload(vllm.lora.layers.base_linear)
yield
+11
View File
@@ -1,7 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from importlib.metadata import version
import pytest
from packaging.version import Version
import vllm
from vllm.assets.image import ImageAsset
@@ -10,6 +13,14 @@ from vllm.platforms import current_platform
from ..utils import multi_gpu_test
pytestmark = pytest.mark.skipif(
Version("5.0") <= Version(version("transformers")),
reason=(
"MiniCPMV custom processor uses tokenizer.im_start_id which is not "
"available on TokenizersBackend in transformers v5.0+"
),
)
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
PROMPT_TEMPLATE = (
-18
View File
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import huggingface_hub.constants
@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf,
enable_hf_transfer,
maybe_remap_kv_scale_name,
)
def test_hf_transfer_auto_activation():
if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
# in case it is already set, we can't test the auto activation
pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
enable_hf_transfer()
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa
HF_TRANSFER_ACTIVE = True
except ImportError:
HF_TRANSFER_ACTIVE = False
assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
def test_download_weights_from_hf():
with tempfile.TemporaryDirectory() as tmpdir:
# assert LocalEntryNotFoundError error is thrown
@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:
if __name__ == "__main__":
test_hf_transfer_auto_activation()
test_download_weights_from_hf()
@@ -143,6 +143,11 @@ def test_models(
# in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
# This untrained model is sensitive to rounding error.
# Fuse ops to reduce bfloat16 rounding.
monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
@@ -69,7 +69,10 @@ MODELS = [
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
# Skip: model's custom tokenizer on HF hub is incompatible with
# transformers v5 (sets attrs before super().__init__, triggering
# AttributeError on 'verbose' in __getattr__).
enable_test=False,
),
]
@@ -72,7 +72,8 @@ MODELS = [
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
# Skip: numerical regression with transformers v5.
enable_test=False,
),
########## ModernBertModel
EmbedModelInfo(
@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
@pytest.mark.skip(
reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
"is incompatible with transformers v5 (missing all_tied_weights_keys)"
)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dimensions", [16, 32])
@@ -186,7 +186,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModel,
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
# TODO: Remove skip once model has been upstreamed to Transformers
pytest.mark.skip(
reason="Custom model code is not compatible with Transformers v5"
),
],
),
#### Transformers fallback to test
## To reduce test burden, we only test batching arbitrary image size
@@ -397,14 +404,14 @@ VLM_TEST_SETTINGS = {
"gemma4": VLMTestInfo(
models=["google/gemma-4-E2B-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "What's the content in the center of the image?",
"cherry_blossom": "What is the season?",
"stop_sign": "<|image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|image|>What is the season?",
}
),
multi_image_prompt="Describe the two images in detail.",
multi_image_prompt="<|image|><|image|>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
@@ -533,6 +540,12 @@ VLM_TEST_SETTINGS = {
max_model_len=4096,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"intern_vl-video": VLMTestInfo(
models=[
@@ -545,6 +558,12 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
@@ -591,6 +610,8 @@ VLM_TEST_SETTINGS = {
hf_model_kwargs={"device_map": "auto"},
patch_hf_runner=model_utils.isaac_patch_hf_runner,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[pytest.mark.skip(reason="Custom model imports deleted object")], # noqa: E501
),
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
@@ -806,7 +827,12 @@ VLM_TEST_SETTINGS = {
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
reason="This model is broken in Transformers v4.57.3",
)
),
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
"['default'] which was removed in transformers v5",
),
],
),
"phi3v": VLMTestInfo(
@@ -960,6 +986,12 @@ VLM_TEST_SETTINGS = {
)
for inp in custom_inputs.different_patch_input_cases_internvl()
],
# TODO: Remove skip once model has been upstreamed to Transformers
marks=[
pytest.mark.skip(
reason="Custom model code tries to access data from meta-tensor"
)
],
),
"llava_onevision-multiple-images": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
@@ -103,6 +103,10 @@ def run_test(
)
@pytest.mark.skip(
reason="Model's custom MBart decoder has head count mismatch with "
"transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
)
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -2,9 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from importlib.metadata import version
import pytest
import regex as re
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.logprobs import SampleLogprobs
@@ -19,6 +21,15 @@ from ....conftest import (
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close
pytestmark = pytest.mark.skipif(
Version("5.0") <= Version(version("transformers")),
reason=(
"vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
"internals (filter_out_non_signature_kwargs) removed by "
"huggingface/transformers#43514"
),
)
MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
)
@pytest.mark.skip(
reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
"doesn't resolve chat_template=None to the default template"
)
def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
"""Compare vLLM Mistral-format output against HF Transformers reference.
@@ -80,6 +80,11 @@ def run_test(
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
# Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
# already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
if "limit_mm_per_prompt" in vllm_runner_kwargs_:
limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
with vllm_runner(
model,
max_model_len=max_model_len,
@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import VllmRunner
pytestmark = pytest.mark.skip(
reason="ColQwen3 model's weight tying is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS = [
"TomoroAI/tomoro-colqwen3-embed-4b",
"OpenSearch-AI/Ops-Colqwen3-4B",
@@ -12,6 +12,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import ImageTestAssets
pytestmark = pytest.mark.skip(
reason="InternVisionModel's custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner
pytestmark = pytest.mark.skip(
reason="jinaai/jina-reranker-m0 custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS = ["jinaai/jina-reranker-m0"]
MM_PROCESSOR_KWARGS = {
@@ -17,11 +17,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from importlib.metadata import version
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from packaging.version import Version
from transformers import PretrainedConfig
from tests.models.registry import HF_EXAMPLE_MODELS
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"
@pytest.mark.skipif(
Version(version("transformers")) >= Version("5.5"),
reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
"with a different get_audio_features signature (requires input_ids)",
)
def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
from transformers.models.musicflamingo import (
modeling_musicflamingo as hf_musicflamingo_modeling,
+130 -9
View File
@@ -335,7 +335,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"internlm/internlm2-chat-7b", trust_remote_code=True
),
"InternLM2VEForCausalLM": _HfExamplesInfo(
"OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
"OpenGVLab/Mono-InternVL-2B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"Custom config cannot be loaded with Transformers "
"v5 because `vision_config` is not always set"
)
},
),
"InternLM3ForCausalLM": _HfExamplesInfo(
"internlm/internlm3-8b-instruct", trust_remote_code=True
@@ -475,6 +483,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Plamo2ForCausalLM": _HfExamplesInfo(
"pfnet/plamo-2-1b",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"hf": (
"Custom model code uses `_tied_weight_keys: list[str]` but "
"Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
)
},
),
"Plamo3ForCausalLM": _HfExamplesInfo(
"pfnet/plamo-3-nict-2b-base",
@@ -515,6 +530,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
max_model_len=4096,
is_available_online=True,
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"vllm upgraded transformers above v5.4 where "
"validate_rope() no longer accepts ignore_keys param"
)
},
),
"SeedOssForCausalLM": _HfExamplesInfo(
"ByteDance-Seed/Seed-OSS-36B-Instruct",
@@ -553,6 +575,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"xverse/XVERSE-7B-Chat",
tokenizer="meta-llama/Llama-2-7b",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "XVERSE tokenizer is incompatible with transformers v5 "
"(add_prefix_space / prepend_scheme mismatch).",
},
),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
@@ -763,10 +790,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
"nvidia/audio-flamingo-3-hf",
min_transformers_version="5.3.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
),
"MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
"nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0"
"nvidia/music-flamingo-2601-hf",
min_transformers_version="5.3.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
@@ -821,12 +856,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
"allendou/FireRedASR2-LLM-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FireRedLIDForConditionalGeneration": _HfExamplesInfo(
"PatchyTisa/FireRedLID-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FunASRForConditionalGeneration": _HfExamplesInfo(
"allendou/Fun-ASR-Nano-2512-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FunAudioChatForConditionalGeneration": _HfExamplesInfo(
"funaudiochat", is_available_online=False
@@ -868,6 +921,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"HCXVisionForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"Custom config cannot be loaded with Transformers "
"v5 because `text_config` is not always set"
)
},
),
"HCXVisionV2ForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
@@ -887,7 +947,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
),
"InternS1ForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1", trust_remote_code=True
"internlm/Intern-S1",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom tokenizer code is not compatible with Transformers v5."
},
),
"InternS1ProForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1-Pro",
@@ -976,7 +1041,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiDashengLMModel": _HfExamplesInfo(
"mispeech/midashenglm-7b", trust_remote_code=True
),
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
"MiniCPMO": _HfExamplesInfo(
"openbmb/MiniCPM-o-2_6",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"hf": "Custom processor code is not compatible with Transformers v5."
},
),
"MiniCPMV": _HfExamplesInfo(
"openbmb/MiniCPM-Llama3-V-2_5",
extras={
@@ -984,6 +1056,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"4.0": "openbmb/MiniCPM-V-4",
"4.5": "openbmb/MiniCPM-V-4_5",
},
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"MiniCPMVBatchFeature is incompatible with its base class in "
"Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
)
},
trust_remote_code=True,
),
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
@@ -1083,13 +1162,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"OpenCUAForConditionalGeneration": _HfExamplesInfo(
"xlangai/OpenCUA-7B", trust_remote_code=True
"xlangai/OpenCUA-7B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Tokenizer cannot be initialised in Transformers v5."
},
),
"OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
"FreedomIntelligence/openPangu-VL-7B",
trust_remote_code=True,
max_model_len=4096,
enforce_eager=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
"making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
)
},
),
"Ovis": _HfExamplesInfo(
"AIDC-AI/Ovis2-1B",
@@ -1101,12 +1192,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
},
),
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
"Ovis2_5": _HfExamplesInfo(
"AIDC-AI/Ovis2.5-2B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom processor code is not compatible with Transformers v5."
},
),
"Ovis2_6ForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
),
"Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
"AIDC-AI/Ovis2.6-30B-A3B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom processor code is not compatible with Transformers v5."
},
),
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
"PaddlePaddle/PaddleOCR-VL",
@@ -1126,7 +1229,17 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
),
"Phi4ForCausalLMV": _HfExamplesInfo(
"microsoft/Phi-4-reasoning-vision-15B", trust_remote_code=True
"microsoft/Phi-4-reasoning-vision-15B",
trust_remote_code=True,
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"vllm upgraded transformers above v5.4 where HF model "
"custom code uses siglip2 internals "
"(filter_out_non_signature_kwargs) removed "
"by huggingface/transformers#43514"
)
},
),
"Phi4MMForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
@@ -1223,6 +1336,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"Qwen2VLConfig was split into Qwen2VLConfig + "
"Qwen2VLTextConfig in transformers v5, breaking "
"attribute access (num_attention_heads, hidden_size, etc.)"
)
},
),
"VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507",
+10 -1
View File
@@ -476,7 +476,16 @@ def dummy_hf_overrides(
else:
# Use minimal layers for testing
num_layers = 1
num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
num_hidden_layers = (
3
if model_arch
in (
"Gemma3nForConditionalGeneration",
"Gemma4ForCausalLM",
"Gemma4ForConditionalGeneration",
)
else 1
)
update_dict = {
"num_layers": num_layers,
@@ -2,10 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import get_tokenizer
parser_name = "step3p5"
start_token = "<think>"
@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
@pytest.fixture(scope="module")
def step3p5_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)
SIMPLE_REASONING = {
+5 -1
View File
@@ -557,12 +557,16 @@ def test_eagle_correctness_light(
"auto",
0.8,
),
(
pytest.param(
("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
False,
False,
"transformers",
0.8,
# TODO(hmellor): figure out why memory usage is so high
marks=pytest.mark.skip(
reason="Feature is experimental and uses too much memory in CI",
),
),
pytest.param(
(
@@ -265,12 +265,24 @@ class GGUFModelLoader(BaseModelLoader):
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
or None if no mapping found
"""
# In transformers v5, multimodal models (e.g. Gemma3) wrap
# all sub-models under an outer 'model.' attribute, producing
# state_dict keys like 'model.language_model.layers.0...' and
# 'model.vision_tower.vision_model...'. Strip this outer
# prefix so the keys match what gguf-py expects.
if is_multimodal and hf_name.startswith("model."):
hf_name = hf_name[6:] # Remove outer 'model.'
# Strip 'language_model.' prefix for multimodal models - gguf-py
# tensor mappings expect parameter names without this prefix.
# Note: 'model.' prefix should be KEPT for text-only models as
# gguf-py expects it.
if hf_name.startswith("language_model."):
hf_name = hf_name[15:] # Remove 'language_model.'
# Re-add 'model.' prefix because gguf-py text tensor maps
# expect 'model.layers...' format.
if is_multimodal:
hf_name = "model." + hf_name
# Parse parameter name and suffix
if hf_name.endswith((".weight", ".bias")):
+36 -15
View File
@@ -125,8 +125,12 @@ class Gemma4AudioInputs(TensorSchema):
"""
type: Literal["audio"] = "audio"
input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
input_features_padded: Annotated[
torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
]
input_features_mask: Annotated[
torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
]
Gemma4ImageInputs = Gemma4ImagePixelInputs
@@ -510,6 +514,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video: list[list[float]] = []
video_frame_counts: list[int] = []
video_replacements: list[str] = []
for item in videos:
video_array, metadata = item
@@ -562,10 +568,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video.append(timestamps)
video_frame_counts.append(len(frames))
# Build expanded replacement text and replace the
# <|video|> placeholder in the prompt.
# Use split(token, 1) to avoid collision — the
# replacement text itself contains <|video|> tokens.
# Build expanded replacement text for this video.
ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
replacement = " ".join(
f"{t} {processor.boi_token}"
@@ -573,9 +576,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
f"{processor.eoi_token}"
for t, n in zip(ts_strs, num_soft_per_frame)
)
parts = prompt.split(processor.video_token, 1)
if len(parts) == 2:
prompt = parts[0] + replacement + parts[1]
video_replacements.append(replacement)
# Replace all <|video|> placeholders at once. We split on
# video_token to get N+1 parts, then interleave with the
# N replacement strings. This avoids the iterative
# split-replace bug where replacement text (which itself
# contains <|video|> tokens) collides with later splits.
vt = processor.video_token
parts = prompt.split(vt, len(video_replacements))
# NOTE: len(parts) <= len(video_replacements) + 1
parts_with_repl: list[str] = []
for part, repl in zip(parts, video_replacements):
parts_with_repl.extend([part, repl])
parts_with_repl.extend(parts[len(video_replacements) :])
prompt = "".join(parts_with_repl)
video_outputs = {
"pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
@@ -638,19 +655,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
)
if "input_features" in processed_outputs:
# Keep padded features for batched audio tower execution.
processed_outputs["input_features_padded"] = processed_outputs[
"input_features"
]
# Unpad per-item so each item's cache entry is self-contained.
# Unpad per-item so each item's cache entry is
# self-contained. The batched() field config in
# _get_mm_fields_config will re-pad all fields to the
# batch's max length at batch time, ensuring consistent
# padding regardless of cache history.
masks = processed_outputs["input_features_mask"]
unpadded_features = [
f[mask]
for f, mask in zip(
processed_outputs["input_features"],
processed_outputs["input_features_mask"],
masks,
)
]
unpadded_masks = [mask[mask] for mask in masks]
processed_outputs["input_features"] = unpadded_features
processed_outputs["input_features_padded"] = unpadded_features
processed_outputs["input_features_mask"] = unpadded_masks
# Merge video outputs into the final result
combined_outputs = dict(processed_outputs, **video_outputs)
+1 -1
View File
@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import (
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
@@ -275,6 +275,11 @@ class Base(
)
class SupportTorchCompileWrapper(cls): ...
# Preserve __module__ so transformers v5's source-file checks
# (e.g. _can_set_experts_implementation) read the original
# model's module instead of this file.
SupportTorchCompileWrapper.__module__ = cls.__module__
# Patch the class in its module
module = sys.modules[cls.__module__]
setattr(module, cls.__name__, SupportTorchCompileWrapper)
+34 -1
View File
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
@@ -10,6 +11,7 @@ from typing_extensions import TypeVar, assert_never
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from vllm.transformers_utils.gguf_utils import (
check_gguf_file,
get_gguf_file_path_from_hf,
@@ -31,6 +33,13 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
# Model types whose hub tokenizer_class is incorrect and should be overridden with
# TokenizersBackend (the generic fast tokenizer). Adding a model type here is always a
# temporary workaround; better long-term solutions are:
# - Add model type to MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS in transformers (better)
# - Fix tokenizer_class on the hub for the affected models (best)
_MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS: set[str] = {"step3_vl"}
_VLLM_TOKENIZERS = {
"deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
"grok2": ("grok2", "Grok2Tokenizer"),
@@ -202,7 +211,31 @@ def get_tokenizer(
**kwargs,
)
if tokenizer_cls == TokenizerLike:
# Load the config first so that, if it is one of the custom configs provided by
# vllm.transformers_utils.config, it is registered with AutoConfig before the
# tokenizer is loaded. This is necessary since tokenizer_cls_.from_pretrained will
# call AutoConfig.from_pretrained internally.
# Loading the config may fail for paths that don't have a model config (e.g. LoRA
# adapters), which is fine — those don't need custom config registration.
config = None
with contextlib.suppress(ValueError, OSError):
config = get_config(
tokenizer_name,
trust_remote_code=trust_remote_code,
revision=revision,
)
# Some models have an incorrect tokenizer_class on the hub.
# For these model types, bypass AutoTokenizer and use TokenizersBackend directly.
model_type = getattr(config, "model_type", None) if config else None
if model_type in _MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS:
from transformers.tokenization_utils_tokenizers import TokenizersBackend
logger.debug(
"Overriding tokenizer_class to TokenizersBackend for model_type=%r",
model_type,
)
tokenizer_cls_ = TokenizersBackend
elif tokenizer_cls == TokenizerLike:
tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
else:
tokenizer_cls_ = tokenizer_cls