Update to transformers v5 (#30566)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: khluu <khluu000@gmail.com> Signed-off-by: Kevin H. Luu <khluu000@gmail.com> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: khluu <khluu000@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: jiang1.li <jiang1.li@intel.com>
2026-06-06 00:16:14 +00:00 · 2026-04-16 00:29:15 +01:00
parent 6dc9491406
commit 03f8d3a548
41 changed files with 445 additions and 115 deletions
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
 docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
@@ -4,7 +4,6 @@ depends_on:
 steps:
 - label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
-  device: h200_18gb
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -73,3 +72,18 @@ steps:
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Transformers Backward Compatibility Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install transformers==4.57.5
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -642,7 +642,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    else \
        BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
    fi; \
-    uv pip install --system accelerate hf_transfer modelscope \
+    uv pip install --system accelerate modelscope \
        "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"

 # ============================================================
@@ -756,9 +756,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+ENV HF_XET_HIGH_PERFORMANCE 1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60

 # Copy in the v1 package for testing (it isn't distributed yet)
 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
@@ -197,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e tests/vllm_test_utils

+# enable fast downloads from hf (for testing)
+ENV HF_XET_HIGH_PERFORMANCE 1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
+
 ######################### RELEASE IMAGE #########################
 FROM base AS vllm-openai

@@ -272,9 +272,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+ENV HF_XET_HIGH_PERFORMANCE 1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60

 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/test/nightly-torch.txt
@@ -365,9 +365,10 @@ RUN cd /vllm-workspace \
    && python3 -m pip install pytest-shard

 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HF_XET_HIGH_PERFORMANCE=1
+
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT=60

 # install audio decode package `torchcodec` from source (required due to 
 # ROCm and torch version mismatch) for tests with datasets package
@@ -240,7 +240,7 @@ uv pip install vllm==${VLLM_VERSION} \
        # Install dependencies
        pip install --upgrade numba \
            scipy \
-            huggingface-hub[cli,hf_transfer] \
+            huggingface-hub[cli] \
            setuptools_scm
        pip install -r requirements/rocm.txt

@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.56.0, < 5
+transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.14.0.1 # required for compressed-tensors
+compressed-tensors == 0.15.0.1 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
@@ -18,7 +18,7 @@ httpx
 librosa # required for audio tests
 vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
-peft>=0.15.0 # required for phi-4-mm test
+peft>=0.18.1 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
 resampy # required for audio tests
@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes==0.49.2
@@ -4,7 +4,7 @@ absl-py==2.1.0
    # via
    #   rouge-score
    #   tensorboard
-accelerate==1.0.1
+accelerate==1.13.0
    # via peft
 aenum==3.1.16
    # via lightly
@@ -248,7 +248,6 @@ filelock==3.16.1
    #   huggingface-hub
    #   ray
    #   torch
-    #   transformers
    #   virtualenv
 fiona==1.10.1
    # via torchgeo
@@ -331,7 +330,7 @@ h5py==3.13.0
    # via terratorch
 harfile==0.3.0
    # via schemathesis
-hf-xet==1.1.7
+hf-xet==1.4.3
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
@@ -345,9 +344,10 @@ httpx==0.27.2
    # via
    #   -r requirements/test/cuda.in
    #   diffusers
+    #   huggingface-hub
    #   perceptron
    #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
    #   accelerate
    #   datasets
@@ -756,7 +756,7 @@ pathvalidate==3.2.1
    # via pytablewriter
 patsy==1.0.1
    # via statsmodels
-peft==0.16.0
+peft==0.18.1
    # via -r requirements/test/cuda.in
 perceptron==0.1.4
    # via -r requirements/test/cuda.in
@@ -982,7 +982,7 @@ referencing==0.35.1
    # via
    #   jsonschema
    #   jsonschema-specifications
-regex==2024.9.11
+regex==2026.2.28
    # via
    #   diffusers
    #   nltk
@@ -1002,7 +1002,6 @@ requests==2.32.3
    #   google-api-core
    #   google-cloud-storage
    #   gpt-oss
-    #   huggingface-hub
    #   lightly
    #   lm-eval
    #   mistral-common
@@ -1015,7 +1014,6 @@ requests==2.32.3
    #   starlette-testclient
    #   tacoreader
    #   tiktoken
-    #   transformers
    #   wandb
 resampy==0.4.3
    # via -r requirements/test/cuda.in
@@ -1216,7 +1214,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/cuda.in
@@ -1295,7 +1293,7 @@ tqdm==4.67.3
    #   tacoreader
    #   terratorch
    #   transformers
-transformers==4.57.5
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/cuda.in
@@ -1317,7 +1315,9 @@ typepy==1.3.2
 typer==0.15.2
    # via
    #   fastsafetensors
+    #   huggingface-hub
    #   perceptron
+    #   transformers
 types-python-dateutil==2.9.0.20241206
    # via arrow
 typeshed-client==2.8.2
@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.49.2
@@ -38,8 +38,8 @@ opencv-python-headless>=4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
-tokenizers==0.22.0
+transformers==5.5.3
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test
 # quantization
 bitsandbytes==0.49.2
@@ -82,4 +82,3 @@ plotly # required for perf comparison html report
 rapidfuzz
 torchgeo==0.7.0
 multiprocess==0.70.16
-huggingface-hub==0.36.2
@@ -39,7 +39,7 @@ annotated-doc==0.0.4
    #   typer
 annotated-types==0.7.0
    # via pydantic
-anthropic==0.89.0
+anthropic==0.93.0
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -172,7 +172,7 @@ colorful==0.5.8
    # via ray
 colorlog==6.10.1
    # via optuna
-compressed-tensors==0.14.0.1
+compressed-tensors==0.15.0.1
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -269,9 +269,9 @@ fastapi==0.135.2
    #   model-hosting-container-standards
 fastapi-cli==0.0.24
    # via fastapi
-fastapi-cloud-cli==0.15.1
+fastapi-cloud-cli==0.16.1
    # via fastapi-cli
-fastar==0.9.0
+fastar==0.10.0
    # via fastapi-cloud-cli
 fastparquet==2026.3.0
    # via genai-perf
@@ -290,7 +290,6 @@ filelock==3.25.2
    #   python-discovery
    #   ray
    #   torch
-    #   transformers
    #   virtualenv
 fiona==1.10.1
    # via torchgeo
@@ -384,7 +383,7 @@ h5py==3.16.0
    # via terratorch
 harfile==0.4.0
    # via schemathesis
-hf-xet==1.4.2
+hf-xet==1.4.3
    # via huggingface-hub
 hiredis==3.3.1
    # via tensorizer
@@ -403,6 +402,7 @@ httpx==0.27.2
    #   diffusers
    #   fastapi
    #   fastapi-cloud-cli
+    #   huggingface-hub
    #   mcp
    #   model-hosting-container-standards
    #   openai
@@ -410,9 +410,8 @@ httpx==0.27.2
    #   schemathesis
 httpx-sse==0.4.3
    # via mcp
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
-    #   -r requirements/test/rocm.in
    #   accelerate
    #   datasets
    #   diffusers
@@ -484,7 +483,7 @@ jinja2==3.1.6
    #   genai-perf
    #   lm-eval
    #   torch
-jiter==0.13.0
+jiter==0.14.0
    # via
    #   anthropic
    #   openai
@@ -631,7 +630,7 @@ msgpack==1.1.2
    # via
    #   librosa
    #   ray
-msgspec==0.20.0
+msgspec==0.21.0
    # via -r requirements/test/../common.txt
 mteb==2.11.5
    # via -r requirements/test/rocm.in
@@ -742,7 +741,7 @@ omegaconf==2.3.0
    #   lightning
 open-clip-torch==2.32.0
    # via -r requirements/test/rocm.in
-openai==2.30.0
+openai==2.31.0
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -1093,7 +1092,7 @@ python-dotenv==1.2.2
    #   uvicorn
 python-json-logger==4.1.0
    # via -r requirements/test/../common.txt
-python-multipart==0.0.22
+python-multipart==0.0.26
    # via
    #   fastapi
    #   mcp
@@ -1180,7 +1179,6 @@ requests==2.32.5
    #   google-api-core
    #   google-cloud-storage
    #   gpt-oss
-    #   huggingface-hub
    #   lightly
    #   lm-eval
    #   mistral-common
@@ -1194,7 +1192,6 @@ requests==2.32.5
    #   starlette-testclient
    #   tacoreader
    #   tiktoken
-    #   transformers
    #   wandb
 resampy==0.4.3
    # via -r requirements/test/rocm.in
@@ -1428,7 +1425,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -1471,7 +1468,7 @@ tqdm==4.67.3
    #   tacoreader
    #   terratorch
    #   transformers
-transformers==4.57.5
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   -r requirements/test/../common.txt
@@ -1498,7 +1495,9 @@ typer==0.24.1
    #   fastapi-cli
    #   fastapi-cloud-cli
    #   fastsafetensors
+    #   huggingface-hub
    #   perceptron
+    #   transformers
 typeshed-client==2.9.0
    # via jsonargparse
 typing-extensions==4.15.0
@@ -13,7 +13,6 @@ pytest-shard
 absl-py
 accelerate
 arctic-inference
-hf_transfer
 lm_eval[api]
 modelscope

@@ -19,7 +19,9 @@ aiosignal==1.4.0
 albumentations==1.4.6
    # via -r requirements/test/xpu.in
 annotated-doc==0.0.4
-    # via fastapi
+    # via
+    #   fastapi
+    #   typer
 annotated-types==0.7.0
    # via pydantic
 anyio==4.13.0
@@ -64,6 +66,7 @@ click==8.3.1
    #   jiwer
    #   nltk
    #   schemathesis
+    #   typer
    #   uvicorn
 colorama==0.4.6
    # via sacrebleu
@@ -112,7 +115,6 @@ filelock==3.25.2
    #   huggingface-hub
    #   modelscope
    #   torch
-    #   transformers
 frozenlist==1.8.0
    # via
    #   aiohttp
@@ -133,9 +135,7 @@ h11==0.16.0
    #   uvicorn
 harfile==0.4.0
    # via schemathesis
-hf-transfer==0.1.9
-    # via -r requirements/test/xpu.in
-hf-xet==1.4.2
+hf-xet==1.4.3
    # via huggingface-hub
 html2text==2025.4.15
    # via gpt-oss
@@ -144,8 +144,9 @@ httpcore==1.0.9
 httpx==0.28.1
    # via
    #   datasets
+    #   huggingface-hub
    #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
    #   accelerate
    #   datasets
@@ -515,7 +516,6 @@ requests==2.33.1
    #   docker
    #   evaluate
    #   gpt-oss
-    #   huggingface-hub
    #   lm-eval
    #   mistral-common
    #   modelscope
@@ -524,11 +524,11 @@ requests==2.33.1
    #   schemathesis
    #   starlette-testclient
    #   tiktoken
-    #   transformers
 rich==14.3.3
    # via
    #   mteb
    #   schemathesis
+    #   typer
 rouge-score==0.1.2
    # via lm-eval
 rpds-py==0.30.0
@@ -572,6 +572,8 @@ setuptools==80.10.2
    #   modelscope
    #   pytablewriter
    #   torch
+shellingham==1.5.4
+    # via typer
 six==1.17.0
    # via
    #   -c requirements/common.txt
@@ -665,7 +667,7 @@ tqdm==4.67.3
    #   pqdm
    #   sentence-transformers
    #   transformers
-transformers==4.57.6
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   sentence-transformers
@@ -676,6 +678,10 @@ typepy==1.3.4
    #   dataproperty
    #   pytablewriter
    #   tabledata
+typer==0.24.1
+    # via
+    #   huggingface-hub
+    #   transformers
 typing-extensions==4.15.0
    # via
    #   -c requirements/common.txt
@@ -410,6 +410,15 @@ class HfRunner:
            model_name,
            trust_remote_code=trust_remote_code,
        )
+        # HF runner should use the HF config so that it's consistent with the HF model
+        if self.config.__module__.startswith("vllm.transformers_utils.configs"):
+            from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+            del CONFIG_MAPPING._extra_content[self.config.model_type]
+            self.config = AutoConfig.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+            )
        self.device = self.get_default_device()
        self.dtype = dtype = _get_and_verify_dtype(
            self.model_name,
@@ -3,6 +3,7 @@

 import tempfile
 from collections import OrderedDict
+from importlib import reload
 from unittest.mock import MagicMock

 import pytest
@@ -47,6 +48,11 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
    if current_platform.is_cuda():
        monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
+        import vllm.lora.layers.base_linear
+
+        if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
+            # Reload the module to ensure the environment variable takes effect.
+            reload(vllm.lora.layers.base_linear)
    yield


@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from importlib.metadata import version
+
 import pytest
+from packaging.version import Version

 import vllm
 from vllm.assets.image import ImageAsset
@@ -10,6 +13,14 @@ from vllm.platforms import current_platform

 from ..utils import multi_gpu_test

+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "MiniCPMV custom processor uses tokenizer.im_start_id which is not "
+        "available on TokenizersBackend in transformers v5.0+"
+    ),
+)
+
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"

 PROMPT_TEMPLATE = (
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import os
 import tempfile

 import huggingface_hub.constants
@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError

 from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf,
-    enable_hf_transfer,
    maybe_remap_kv_scale_name,
 )


-def test_hf_transfer_auto_activation():
-    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
-        # in case it is already set, we can't test the auto activation
-        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
-    enable_hf_transfer()
-    try:
-        # enable hf hub transfer if available
-        import hf_transfer  # type: ignore # noqa
-
-        HF_TRANSFER_ACTIVE = True
-    except ImportError:
-        HF_TRANSFER_ACTIVE = False
-    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
-
-
 def test_download_weights_from_hf():
    with tempfile.TemporaryDirectory() as tmpdir:
        # assert LocalEntryNotFoundError error is thrown
@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:


 if __name__ == "__main__":
-    test_hf_transfer_auto_activation()
    test_download_weights_from_hf()
@@ -143,6 +143,11 @@ def test_models(
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

+    if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
+        # This untrained model is sensitive to rounding error, so turn off
+        # VLLM_CPU_CI_ENV to fuse ops and reduce bfloat16 rounding.
+        monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
+
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs
@@ -69,7 +69,10 @@ MODELS = [
        attn_type="decoder",
        is_prefix_caching_supported=True,
        is_chunked_prefill_supported=True,
-        enable_test=True,
+        # Skip: model's custom tokenizer on HF hub is incompatible with
+        # transformers v5 (sets attrs before super().__init__, triggering
+        # AttributeError on 'verbose' in __getattr__).
+        enable_test=False,
    ),
 ]

@@ -72,7 +72,8 @@ MODELS = [
        attn_type="encoder_only",
        is_prefix_caching_supported=False,
        is_chunked_prefill_supported=False,
-        enable_test=True,
+        # Skip: numerical regression with transformers v5.
+        enable_test=False,
    ),
    ########## ModernBertModel
    EmbedModelInfo(
@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
    mteb_test_rerank_models(vllm_runner, model_info)


+@pytest.mark.skip(
+    reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
+    "is incompatible with transformers v5 (missing all_tied_weights_keys)"
+)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("dimensions", [16, 32])
@@ -186,7 +186,14 @@ VLM_TEST_SETTINGS = {
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        marks=[
+            pytest.mark.core_model,
+            pytest.mark.cpu_model,
+            # TODO: Remove skip once model has been upstreamed to Transformers
+            pytest.mark.skip(
+                reason="Custom model code is not compatible with Transformers v5"
+            ),
+        ],
    ),
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
@@ -397,14 +404,14 @@ VLM_TEST_SETTINGS = {
    "gemma4": VLMTestInfo(
        models=["google/gemma-4-E2B-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
-                "stop_sign": "What's the content in the center of the image?",
-                "cherry_blossom": "What is the season?",
+                "stop_sign": "<|image|>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<|image|>What is the season?",
            }
        ),
-        multi_image_prompt="Describe the two images in detail.",
+        multi_image_prompt="<|image|><|image|>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
@@ -533,6 +540,12 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
    ),
    "intern_vl-video": VLMTestInfo(
        models=[
@@ -545,6 +558,12 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        num_logprobs=10 if current_platform.is_rocm() else 5,
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
    ),
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
@@ -591,6 +610,8 @@ VLM_TEST_SETTINGS = {
        hf_model_kwargs={"device_map": "auto"},
        patch_hf_runner=model_utils.isaac_patch_hf_runner,
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[pytest.mark.skip(reason="Custom model imports deleted object")],  # noqa: E501
    ),
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
@@ -806,7 +827,12 @@ VLM_TEST_SETTINGS = {
            pytest.mark.skipif(
                Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
                reason="This model is broken in Transformers v4.57.3",
-            )
+            ),
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
+                reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
+                "['default'] which was removed in transformers v5",
+            ),
        ],
    ),
    "phi3v": VLMTestInfo(
@@ -960,6 +986,12 @@ VLM_TEST_SETTINGS = {
            )
            for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
    ),
    "llava_onevision-multiple-images": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
@@ -103,6 +103,10 @@ def run_test(
        )


+@pytest.mark.skip(
+    reason="Model's custom MBart decoder has head count mismatch with "
+    "transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
+)
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -2,9 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from collections.abc import Sequence
+from importlib.metadata import version

 import pytest
 import regex as re
+from packaging.version import Version
 from transformers import AutoModelForCausalLM, AutoTokenizer

 from vllm.logprobs import SampleLogprobs
@@ -19,6 +21,15 @@ from ....conftest import (
 from ....utils import multi_gpu_test
 from ...utils import check_logprobs_close

+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
+        "internals (filter_out_non_signature_kwargs) removed by "
+        "huggingface/transformers#43514"
+    ),
+)
+
 MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
    )


+@pytest.mark.skip(
+    reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
+    "doesn't resolve chat_template=None to the default template"
+)
 def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
    """Compare vLLM Mistral-format output against HF Transformers reference.

@@ -80,6 +80,11 @@ def run_test(
    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)

+    # Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
+    # already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
+    if "limit_mm_per_prompt" in vllm_runner_kwargs_:
+        limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
+
    with vllm_runner(
        model,
        max_model_len=max_model_len,
@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam

 from ....conftest import VllmRunner

+pytestmark = pytest.mark.skip(
+    reason="ColQwen3 model's weight tying is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
+
 MODELS = [
    "TomoroAI/tomoro-colqwen3-embed-4b",
    "OpenSearch-AI/Ops-Colqwen3-4B",
@@ -12,6 +12,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import ImageTestAssets

+pytestmark = pytest.mark.skip(
+    reason="InternVisionModel's custom code is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
+
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
 DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam

 from ....conftest import HfRunner, VllmRunner

+pytestmark = pytest.mark.skip(
+    reason="jinaai/jina-reranker-m0 custom code is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
+
 MODELS = ["jinaai/jina-reranker-m0"]

 MM_PROCESSOR_KWARGS = {
@@ -17,11 +17,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from importlib.metadata import version
 from unittest.mock import MagicMock

 import numpy as np
 import pytest
 import torch
+from packaging.version import Version
 from transformers import PretrainedConfig

 from tests.models.registry import HF_EXAMPLE_MODELS
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
    assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"


+@pytest.mark.skipif(
+    Version(version("transformers")) >= Version("5.5"),
+    reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
+    "with a different get_audio_features signature (requires input_ids)",
+)
 def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
    from transformers.models.musicflamingo import (
        modeling_musicflamingo as hf_musicflamingo_modeling,
@@ -335,7 +335,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        "internlm/internlm2-chat-7b", trust_remote_code=True
    ),
    "InternLM2VEForCausalLM": _HfExamplesInfo(
-        "OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
+        "OpenGVLab/Mono-InternVL-2B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "Custom config cannot be loaded with Transformers "
+                "v5 because `vision_config` is not always set"
+            )
+        },
    ),
    "InternLM3ForCausalLM": _HfExamplesInfo(
        "internlm/internlm3-8b-instruct", trust_remote_code=True
@@ -475,6 +483,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Plamo2ForCausalLM": _HfExamplesInfo(
        "pfnet/plamo-2-1b",
        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": (
+                "Custom model code uses `_tied_weight_keys: list[str]` but "
+                "Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
+            )
+        },
    ),
    "Plamo3ForCausalLM": _HfExamplesInfo(
        "pfnet/plamo-3-nict-2b-base",
@@ -515,6 +530,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        trust_remote_code=True,
        max_model_len=4096,
        is_available_online=True,
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "vllm upgraded transformers above v5.4 where "
+                "validate_rope() no longer accepts ignore_keys param"
+            )
+        },
    ),
    "SeedOssForCausalLM": _HfExamplesInfo(
        "ByteDance-Seed/Seed-OSS-36B-Instruct",
@@ -553,6 +575,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        "xverse/XVERSE-7B-Chat",
        tokenizer="meta-llama/Llama-2-7b",
        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "XVERSE tokenizer is incompatible with transformers v5 "
+            "(add_prefix_space / prepend_scheme mismatch).",
+        },
    ),
    "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
@@ -763,10 +790,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
    "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
+        "nvidia/audio-flamingo-3-hf",
+        min_transformers_version="5.3.0",
+        transformers_version_reason={
+            "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
+        },
    ),
    "MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0"
+        "nvidia/music-flamingo-2601-hf",
+        min_transformers_version="5.3.0",
+        transformers_version_reason={
+            "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
+        },
    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
    "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
@@ -821,12 +856,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
        "allendou/FireRedASR2-LLM-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
    ),
    "FireRedLIDForConditionalGeneration": _HfExamplesInfo(
        "PatchyTisa/FireRedLID-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
    ),
    "FunASRForConditionalGeneration": _HfExamplesInfo(
        "allendou/Fun-ASR-Nano-2512-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
    ),
    "FunAudioChatForConditionalGeneration": _HfExamplesInfo(
        "funaudiochat", is_available_online=False
@@ -868,6 +921,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "HCXVisionForCausalLM": _HfExamplesInfo(
        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "Custom config cannot be loaded with Transformers "
+                "v5 because `text_config` is not always set"
+            )
+        },
    ),
    "HCXVisionV2ForCausalLM": _HfExamplesInfo(
        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
@@ -887,7 +947,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
    ),
    "InternS1ForConditionalGeneration": _HfExamplesInfo(
-        "internlm/Intern-S1", trust_remote_code=True
+        "internlm/Intern-S1",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom tokenizer code is not compatible with Transformers v5."
+        },
    ),
    "InternS1ProForConditionalGeneration": _HfExamplesInfo(
        "internlm/Intern-S1-Pro",
@@ -976,7 +1041,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "MiDashengLMModel": _HfExamplesInfo(
        "mispeech/midashenglm-7b", trust_remote_code=True
    ),
-    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
+    "MiniCPMO": _HfExamplesInfo(
+        "openbmb/MiniCPM-o-2_6",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": "Custom processor code is not compatible with Transformers v5."
+        },
+    ),
    "MiniCPMV": _HfExamplesInfo(
        "openbmb/MiniCPM-Llama3-V-2_5",
        extras={
@@ -984,6 +1056,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "4.0": "openbmb/MiniCPM-V-4",
            "4.5": "openbmb/MiniCPM-V-4_5",
        },
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "MiniCPMVBatchFeature is incompatible with its base class in "
+                "Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
+            )
+        },
        trust_remote_code=True,
    ),
    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
@@ -1083,13 +1162,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        trust_remote_code=True,
    ),
    "OpenCUAForConditionalGeneration": _HfExamplesInfo(
-        "xlangai/OpenCUA-7B", trust_remote_code=True
+        "xlangai/OpenCUA-7B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Tokenizer cannot be initialised in Transformers v5."
+        },
    ),
    "OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
        "FreedomIntelligence/openPangu-VL-7B",
        trust_remote_code=True,
        max_model_len=4096,
        enforce_eager=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
+                "making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
+            )
+        },
    ),
    "Ovis": _HfExamplesInfo(
        "AIDC-AI/Ovis2-1B",
@@ -1101,12 +1192,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
        },
    ),
-    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
+    "Ovis2_5": _HfExamplesInfo(
+        "AIDC-AI/Ovis2.5-2B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom processor code is not compatible with Transformers v5."
+        },
+    ),
    "Ovis2_6ForCausalLM": _HfExamplesInfo(
        "AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
    ),
    "Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
-        "AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
+        "AIDC-AI/Ovis2.6-30B-A3B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom processor code is not compatible with Transformers v5."
+        },
    ),
    "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
        "PaddlePaddle/PaddleOCR-VL",
@@ -1126,7 +1229,17 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
    ),
    "Phi4ForCausalLMV": _HfExamplesInfo(
-        "microsoft/Phi-4-reasoning-vision-15B", trust_remote_code=True
+        "microsoft/Phi-4-reasoning-vision-15B",
+        trust_remote_code=True,
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "vllm upgraded transformers above v5.4 where HF model "
+                "custom code uses siglip2 internals "
+                "(filter_out_non_signature_kwargs) removed "
+                "by huggingface/transformers#43514"
+            )
+        },
    ),
    "Phi4MMForCausalLM": _HfExamplesInfo(
        "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
@@ -1223,6 +1336,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "architectures": ["Tarsier2ForConditionalGeneration"],
            "model_type": "tarsier2",
        },
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "Qwen2VLConfig was split into Qwen2VLConfig + "
+                "Qwen2VLTextConfig in transformers v5, breaking "
+                "attribute access (num_attention_heads, hidden_size, etc.)"
+            )
+        },
    ),
    "VoxtralForConditionalGeneration": _HfExamplesInfo(
        "mistralai/Voxtral-Mini-3B-2507",
@@ -476,7 +476,16 @@ def dummy_hf_overrides(
    else:
        # Use minimal layers for testing
        num_layers = 1
-        num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
+        num_hidden_layers = (
+            3
+            if model_arch
+            in (
+                "Gemma3nForConditionalGeneration",
+                "Gemma4ForCausalLM",
+                "Gemma4ForConditionalGeneration",
+            )
+            else 1
+        )

    update_dict = {
        "num_layers": num_layers,
@@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
-from transformers import AutoTokenizer

 from tests.reasoning.utils import run_reasoning_extraction
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.tokenizers import get_tokenizer

 parser_name = "step3p5"
 start_token = "<think>"
@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"

 @pytest.fixture(scope="module")
 def step3p5_tokenizer():
-    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+    return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)


 SIMPLE_REASONING = {
@@ -557,12 +557,16 @@ def test_eagle_correctness_light(
            "auto",
            0.8,
        ),
-        (
+        pytest.param(
            ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
            False,
            False,
            "transformers",
            0.8,
+            # TODO(hmellor): figure out why memory usage is so high
+            marks=pytest.mark.skip(
+                reason="Feature is experimental and uses too much memory in CI",
+            ),
        ),
        pytest.param(
            (
@@ -265,12 +265,24 @@ class GGUFModelLoader(BaseModelLoader):
                GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
                or None if no mapping found
            """
+            # In transformers v5, multimodal models (e.g. Gemma3) wrap
+            # all sub-models under an outer 'model.' attribute, producing
+            # state_dict keys like 'model.language_model.layers.0...' and
+            # 'model.vision_tower.vision_model...'.  Strip this outer
+            # prefix so the keys match what gguf-py expects.
+            if is_multimodal and hf_name.startswith("model."):
+                hf_name = hf_name[6:]  # Remove outer 'model.'
+
            # Strip 'language_model.' prefix for multimodal models - gguf-py
            # tensor mappings expect parameter names without this prefix.
            # Note: 'model.' prefix should be KEPT for text-only models as
            # gguf-py expects it.
            if hf_name.startswith("language_model."):
                hf_name = hf_name[15:]  # Remove 'language_model.'
+                # Re-add 'model.' prefix because gguf-py text tensor maps
+                # expect 'model.layers...' format.
+                if is_multimodal:
+                    hf_name = "model." + hf_name

            # Parse parameter name and suffix
            if hf_name.endswith((".weight", ".bias")):
@@ -125,8 +125,12 @@ class Gemma4AudioInputs(TensorSchema):
    """

    type: Literal["audio"] = "audio"
-    input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
-    input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
+    input_features_padded: Annotated[
+        torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
+    ]
+    input_features_mask: Annotated[
+        torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
+    ]


 Gemma4ImageInputs = Gemma4ImagePixelInputs
@@ -510,6 +514,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
            video_timestamps_per_video: list[list[float]] = []
            video_frame_counts: list[int] = []

+            video_replacements: list[str] = []
+
            for item in videos:
                video_array, metadata = item

@@ -562,10 +568,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
                video_timestamps_per_video.append(timestamps)
                video_frame_counts.append(len(frames))

-                # Build expanded replacement text and replace the
-                # <|video|> placeholder in the prompt.
-                # Use split(token, 1) to avoid collision — the
-                # replacement text itself contains <|video|> tokens.
+                # Build expanded replacement text for this video.
                ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
                replacement = " ".join(
                    f"{t} {processor.boi_token}"
@@ -573,9 +576,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
                    f"{processor.eoi_token}"
                    for t, n in zip(ts_strs, num_soft_per_frame)
                )
-                parts = prompt.split(processor.video_token, 1)
-                if len(parts) == 2:
-                    prompt = parts[0] + replacement + parts[1]
+                video_replacements.append(replacement)
+
+            # Replace all <|video|> placeholders at once. We split on
+            # video_token to get N+1 parts, then interleave with the
+            # N replacement strings. This avoids the iterative
+            # split-replace bug where replacement text (which itself
+            # contains <|video|> tokens) collides with later splits.
+            vt = processor.video_token
+            parts = prompt.split(vt, len(video_replacements))
+
+            # NOTE: len(parts) <= len(video_replacements) + 1
+            parts_with_repl: list[str] = []
+            for part, repl in zip(parts, video_replacements):
+                parts_with_repl.extend([part, repl])
+            parts_with_repl.extend(parts[len(video_replacements) :])
+
+            prompt = "".join(parts_with_repl)

            video_outputs = {
                "pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
@@ -638,19 +655,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
            )

        if "input_features" in processed_outputs:
-            # Keep padded features for batched audio tower execution.
-            processed_outputs["input_features_padded"] = processed_outputs[
-                "input_features"
-            ]
-            # Unpad per-item so each item's cache entry is self-contained.
+            # Unpad per-item so each item's cache entry is
+            # self-contained. The batched() field config in
+            # _get_mm_fields_config will re-pad all fields to the
+            # batch's max length at batch time, ensuring consistent
+            # padding regardless of cache history.
+            masks = processed_outputs["input_features_mask"]
            unpadded_features = [
                f[mask]
                for f, mask in zip(
                    processed_outputs["input_features"],
-                    processed_outputs["input_features_mask"],
+                    masks,
                )
            ]
+            unpadded_masks = [mask[mask] for mask in masks]
            processed_outputs["input_features"] = unpadded_features
+            processed_outputs["input_features_padded"] = unpadded_features
+            processed_outputs["input_features_mask"] = unpadded_masks

        # Merge video outputs into the final result
        combined_outputs = dict(processed_outputs, **video_outputs)
@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import (

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
+from vllm.inputs import MultiModalDataDict
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
-    MultiModalDataDict,
    MultiModalFieldConfig,
    MultiModalKwargsItems,
 )
@@ -275,6 +275,11 @@ class Base(
        )
        class SupportTorchCompileWrapper(cls): ...

+        # Preserve __module__ so transformers v5's source-file checks
+        # (e.g. _can_set_experts_implementation) read the original
+        # model's module instead of this file.
+        SupportTorchCompileWrapper.__module__ = cls.__module__
+
        # Patch the class in its module
        module = sys.modules[cls.__module__]
        setattr(module, cls.__name__, SupportTorchCompileWrapper)
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
 from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
@@ -10,6 +11,7 @@ from typing_extensions import TypeVar, assert_never

 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 from vllm.transformers_utils.gguf_utils import (
    check_gguf_file,
    get_gguf_file_path_from_hf,
@@ -31,6 +33,13 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)


+# Model types whose hub tokenizer_class is incorrect and should be overridden with
+# TokenizersBackend (the generic fast tokenizer). Adding a model type here is always a
+# temporary workaround; better long-term solutions are:
+# - Add model type to MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS in transformers (better)
+# - Fix tokenizer_class on the hub for the affected models (best)
+_MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS: set[str] = {"step3_vl"}
+
 _VLLM_TOKENIZERS = {
    "deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
    "grok2": ("grok2", "Grok2Tokenizer"),
@@ -202,7 +211,31 @@ def get_tokenizer(
        **kwargs,
    )

-    if tokenizer_cls == TokenizerLike:
+    # Ensure that, if the config were to come from vllm.transformers_utils.config, it is
+    # registered with AutoConfig before the tokenizer is loaded. This is necessary since
+    # tokenizer_cls_.from_pretrained will call AutoConfig.from_pretrained internally.
+    # This may fail for paths that don't have a model config (e.g. LoRA adapters),
+    # which is fine — those don't need custom config registration.
+    config = None
+    with contextlib.suppress(ValueError, OSError):
+        config = get_config(
+            tokenizer_name,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        )
+
+    # Some models have an incorrect tokenizer_class on the hub.
+    # For these model types, bypass AutoTokenizer and use TokenizersBackend directly.
+    model_type = getattr(config, "model_type", None) if config else None
+    if model_type in _MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS:
+        from transformers.tokenization_utils_tokenizers import TokenizersBackend
+
+        logger.debug(
+            "Overriding tokenizer_class to TokenizersBackend for model_type=%r",
+            model_type,
+        )
+        tokenizer_cls_ = TokenizersBackend
+    elif tokenizer_cls == TokenizerLike:
        tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
    else:
        tokenizer_cls_ = tokenizer_cls