mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
Update to transformers v5 (#30566)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: khluu <khluu000@gmail.com> Signed-off-by: Kevin H. Luu <khluu000@gmail.com> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: khluu <khluu000@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
|
||||
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
|
||||
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
|
||||
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||
|
||||
@@ -4,7 +4,6 @@ depends_on:
|
||||
steps:
|
||||
- label: Basic Models Tests (Initialization)
|
||||
timeout_in_minutes: 45
|
||||
device: h200_18gb
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -73,3 +72,18 @@ steps:
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Transformers Backward Compatibility Models Test
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install transformers==4.57.5
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
+5
-4
@@ -642,7 +642,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope \
|
||||
uv pip install --system accelerate modelscope \
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
|
||||
|
||||
# ============================================================
|
||||
@@ -756,9 +756,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -e tests/vllm_test_utils
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||
ENV HF_XET_HIGH_PERFORMANCE 1
|
||||
|
||||
# increase timeout for hf downloads (for testing)
|
||||
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
|
||||
|
||||
# Copy in the v1 package for testing (it isn't distributed yet)
|
||||
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
|
||||
|
||||
@@ -197,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
ENV HF_XET_HIGH_PERFORMANCE 1
|
||||
|
||||
# increase timeout for hf downloads (for testing)
|
||||
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
|
||||
|
||||
######################### RELEASE IMAGE #########################
|
||||
FROM base AS vllm-openai
|
||||
|
||||
|
||||
@@ -272,9 +272,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -e tests/vllm_test_utils
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||
ENV HF_XET_HIGH_PERFORMANCE 1
|
||||
|
||||
# increase timeout for hf downloads (for testing)
|
||||
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/test/nightly-torch.txt
|
||||
|
||||
@@ -365,9 +365,10 @@ RUN cd /vllm-workspace \
|
||||
&& python3 -m pip install pytest-shard
|
||||
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER=1
|
||||
ENV HF_XET_HIGH_PERFORMANCE=1
|
||||
|
||||
# increase timeout for hf downloads (for testing)
|
||||
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
|
||||
|
||||
# install audio decode package `torchcodec` from source (required due to
|
||||
# ROCm and torch version mismatch) for tests with datasets package
|
||||
|
||||
@@ -240,7 +240,7 @@ uv pip install vllm==${VLLM_VERSION} \
|
||||
# Install dependencies
|
||||
pip install --upgrade numba \
|
||||
scipy \
|
||||
huggingface-hub[cli,hf_transfer] \
|
||||
huggingface-hub[cli] \
|
||||
setuptools_scm
|
||||
pip install -r requirements/rocm.txt
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ requests >= 2.26.0
|
||||
tqdm
|
||||
blake3
|
||||
py-cpuinfo
|
||||
transformers >= 4.56.0, < 5
|
||||
transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
|
||||
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
@@ -37,7 +37,7 @@ pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||
einops # Required for Qwen2-VL.
|
||||
compressed-tensors == 0.14.0.1 # required for compressed-tensors
|
||||
compressed-tensors == 0.15.0.1 # required for compressed-tensors
|
||||
depyf==0.20.0 # required for profiling and debugging with compilation config
|
||||
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
||||
watchfiles # required for http server to monitor the updates of TLS files
|
||||
|
||||
@@ -18,7 +18,7 @@ httpx
|
||||
librosa # required for audio tests
|
||||
vector_quantize_pytorch # required for minicpmo_26 test
|
||||
vocos # required for minicpmo_26 test
|
||||
peft>=0.15.0 # required for phi-4-mm test
|
||||
peft>=0.18.1 # required for phi-4-mm test
|
||||
pqdm
|
||||
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
|
||||
resampy # required for audio tests
|
||||
@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]>=0.4.11 # required for model evaluation test
|
||||
mteb[bm25s]>=2, <3 # required for mteb test
|
||||
transformers==4.57.5
|
||||
tokenizers==0.22.0
|
||||
transformers==5.5.3
|
||||
tokenizers==0.22.2
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
bitsandbytes==0.49.2
|
||||
|
||||
+10
-10
@@ -4,7 +4,7 @@ absl-py==2.1.0
|
||||
# via
|
||||
# rouge-score
|
||||
# tensorboard
|
||||
accelerate==1.0.1
|
||||
accelerate==1.13.0
|
||||
# via peft
|
||||
aenum==3.1.16
|
||||
# via lightly
|
||||
@@ -248,7 +248,6 @@ filelock==3.16.1
|
||||
# huggingface-hub
|
||||
# ray
|
||||
# torch
|
||||
# transformers
|
||||
# virtualenv
|
||||
fiona==1.10.1
|
||||
# via torchgeo
|
||||
@@ -331,7 +330,7 @@ h5py==3.13.0
|
||||
# via terratorch
|
||||
harfile==0.3.0
|
||||
# via schemathesis
|
||||
hf-xet==1.1.7
|
||||
hf-xet==1.4.3
|
||||
# via huggingface-hub
|
||||
hiredis==3.0.0
|
||||
# via tensorizer
|
||||
@@ -345,9 +344,10 @@ httpx==0.27.2
|
||||
# via
|
||||
# -r requirements/test/cuda.in
|
||||
# diffusers
|
||||
# huggingface-hub
|
||||
# perceptron
|
||||
# schemathesis
|
||||
huggingface-hub==0.36.2
|
||||
huggingface-hub==1.10.2
|
||||
# via
|
||||
# accelerate
|
||||
# datasets
|
||||
@@ -756,7 +756,7 @@ pathvalidate==3.2.1
|
||||
# via pytablewriter
|
||||
patsy==1.0.1
|
||||
# via statsmodels
|
||||
peft==0.16.0
|
||||
peft==0.18.1
|
||||
# via -r requirements/test/cuda.in
|
||||
perceptron==0.1.4
|
||||
# via -r requirements/test/cuda.in
|
||||
@@ -982,7 +982,7 @@ referencing==0.35.1
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
regex==2024.9.11
|
||||
regex==2026.2.28
|
||||
# via
|
||||
# diffusers
|
||||
# nltk
|
||||
@@ -1002,7 +1002,6 @@ requests==2.32.3
|
||||
# google-api-core
|
||||
# google-cloud-storage
|
||||
# gpt-oss
|
||||
# huggingface-hub
|
||||
# lightly
|
||||
# lm-eval
|
||||
# mistral-common
|
||||
@@ -1015,7 +1014,6 @@ requests==2.32.3
|
||||
# starlette-testclient
|
||||
# tacoreader
|
||||
# tiktoken
|
||||
# transformers
|
||||
# wandb
|
||||
resampy==0.4.3
|
||||
# via -r requirements/test/cuda.in
|
||||
@@ -1216,7 +1214,7 @@ timm==1.0.17
|
||||
# segmentation-models-pytorch
|
||||
# terratorch
|
||||
# torchgeo
|
||||
tokenizers==0.22.0
|
||||
tokenizers==0.22.2
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/cuda.in
|
||||
@@ -1295,7 +1293,7 @@ tqdm==4.67.3
|
||||
# tacoreader
|
||||
# terratorch
|
||||
# transformers
|
||||
transformers==4.57.5
|
||||
transformers==5.5.3
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/cuda.in
|
||||
@@ -1317,7 +1315,9 @@ typepy==1.3.2
|
||||
typer==0.15.2
|
||||
# via
|
||||
# fastsafetensors
|
||||
# huggingface-hub
|
||||
# perceptron
|
||||
# transformers
|
||||
types-python-dateutil==2.9.0.20241206
|
||||
# via arrow
|
||||
typeshed-client==2.8.2
|
||||
|
||||
@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]>=0.4.11 # required for model evaluation test
|
||||
mteb[bm25s]>=2, <3 # required for mteb test
|
||||
transformers==4.57.5
|
||||
tokenizers==0.22.0
|
||||
transformers==5.5.3
|
||||
tokenizers==0.22.2
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
bitsandbytes>=0.49.2
|
||||
|
||||
@@ -38,8 +38,8 @@ opencv-python-headless>=4.13.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]>=0.4.11 # required for model evaluation test
|
||||
mteb[bm25s]>=2, <3 # required for mteb test
|
||||
transformers==4.57.5
|
||||
tokenizers==0.22.0
|
||||
transformers==5.5.3
|
||||
tokenizers==0.22.2
|
||||
schemathesis>=3.39.15 # Required for openai schema test
|
||||
# quantization
|
||||
bitsandbytes==0.49.2
|
||||
@@ -82,4 +82,3 @@ plotly # required for perf comparison html report
|
||||
rapidfuzz
|
||||
torchgeo==0.7.0
|
||||
multiprocess==0.70.16
|
||||
huggingface-hub==0.36.2
|
||||
|
||||
+15
-16
@@ -39,7 +39,7 @@ annotated-doc==0.0.4
|
||||
# typer
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anthropic==0.89.0
|
||||
anthropic==0.93.0
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/../common.txt
|
||||
@@ -172,7 +172,7 @@ colorful==0.5.8
|
||||
# via ray
|
||||
colorlog==6.10.1
|
||||
# via optuna
|
||||
compressed-tensors==0.14.0.1
|
||||
compressed-tensors==0.15.0.1
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/../common.txt
|
||||
@@ -269,9 +269,9 @@ fastapi==0.135.2
|
||||
# model-hosting-container-standards
|
||||
fastapi-cli==0.0.24
|
||||
# via fastapi
|
||||
fastapi-cloud-cli==0.15.1
|
||||
fastapi-cloud-cli==0.16.1
|
||||
# via fastapi-cli
|
||||
fastar==0.9.0
|
||||
fastar==0.10.0
|
||||
# via fastapi-cloud-cli
|
||||
fastparquet==2026.3.0
|
||||
# via genai-perf
|
||||
@@ -290,7 +290,6 @@ filelock==3.25.2
|
||||
# python-discovery
|
||||
# ray
|
||||
# torch
|
||||
# transformers
|
||||
# virtualenv
|
||||
fiona==1.10.1
|
||||
# via torchgeo
|
||||
@@ -384,7 +383,7 @@ h5py==3.16.0
|
||||
# via terratorch
|
||||
harfile==0.4.0
|
||||
# via schemathesis
|
||||
hf-xet==1.4.2
|
||||
hf-xet==1.4.3
|
||||
# via huggingface-hub
|
||||
hiredis==3.3.1
|
||||
# via tensorizer
|
||||
@@ -403,6 +402,7 @@ httpx==0.27.2
|
||||
# diffusers
|
||||
# fastapi
|
||||
# fastapi-cloud-cli
|
||||
# huggingface-hub
|
||||
# mcp
|
||||
# model-hosting-container-standards
|
||||
# openai
|
||||
@@ -410,9 +410,8 @@ httpx==0.27.2
|
||||
# schemathesis
|
||||
httpx-sse==0.4.3
|
||||
# via mcp
|
||||
huggingface-hub==0.36.2
|
||||
huggingface-hub==1.10.2
|
||||
# via
|
||||
# -r requirements/test/rocm.in
|
||||
# accelerate
|
||||
# datasets
|
||||
# diffusers
|
||||
@@ -484,7 +483,7 @@ jinja2==3.1.6
|
||||
# genai-perf
|
||||
# lm-eval
|
||||
# torch
|
||||
jiter==0.13.0
|
||||
jiter==0.14.0
|
||||
# via
|
||||
# anthropic
|
||||
# openai
|
||||
@@ -631,7 +630,7 @@ msgpack==1.1.2
|
||||
# via
|
||||
# librosa
|
||||
# ray
|
||||
msgspec==0.20.0
|
||||
msgspec==0.21.0
|
||||
# via -r requirements/test/../common.txt
|
||||
mteb==2.11.5
|
||||
# via -r requirements/test/rocm.in
|
||||
@@ -742,7 +741,7 @@ omegaconf==2.3.0
|
||||
# lightning
|
||||
open-clip-torch==2.32.0
|
||||
# via -r requirements/test/rocm.in
|
||||
openai==2.30.0
|
||||
openai==2.31.0
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/../common.txt
|
||||
@@ -1093,7 +1092,7 @@ python-dotenv==1.2.2
|
||||
# uvicorn
|
||||
python-json-logger==4.1.0
|
||||
# via -r requirements/test/../common.txt
|
||||
python-multipart==0.0.22
|
||||
python-multipart==0.0.26
|
||||
# via
|
||||
# fastapi
|
||||
# mcp
|
||||
@@ -1180,7 +1179,6 @@ requests==2.32.5
|
||||
# google-api-core
|
||||
# google-cloud-storage
|
||||
# gpt-oss
|
||||
# huggingface-hub
|
||||
# lightly
|
||||
# lm-eval
|
||||
# mistral-common
|
||||
@@ -1194,7 +1192,6 @@ requests==2.32.5
|
||||
# starlette-testclient
|
||||
# tacoreader
|
||||
# tiktoken
|
||||
# transformers
|
||||
# wandb
|
||||
resampy==0.4.3
|
||||
# via -r requirements/test/rocm.in
|
||||
@@ -1428,7 +1425,7 @@ timm==1.0.17
|
||||
# segmentation-models-pytorch
|
||||
# terratorch
|
||||
# torchgeo
|
||||
tokenizers==0.22.0
|
||||
tokenizers==0.22.2
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/../common.txt
|
||||
@@ -1471,7 +1468,7 @@ tqdm==4.67.3
|
||||
# tacoreader
|
||||
# terratorch
|
||||
# transformers
|
||||
transformers==4.57.5
|
||||
transformers==5.5.3
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# -r requirements/test/../common.txt
|
||||
@@ -1498,7 +1495,9 @@ typer==0.24.1
|
||||
# fastapi-cli
|
||||
# fastapi-cloud-cli
|
||||
# fastsafetensors
|
||||
# huggingface-hub
|
||||
# perceptron
|
||||
# transformers
|
||||
typeshed-client==2.9.0
|
||||
# via jsonargparse
|
||||
typing-extensions==4.15.0
|
||||
|
||||
@@ -13,7 +13,6 @@ pytest-shard
|
||||
absl-py
|
||||
accelerate
|
||||
arctic-inference
|
||||
hf_transfer
|
||||
lm_eval[api]
|
||||
modelscope
|
||||
|
||||
|
||||
@@ -19,7 +19,9 @@ aiosignal==1.4.0
|
||||
albumentations==1.4.6
|
||||
# via -r requirements/test/xpu.in
|
||||
annotated-doc==0.0.4
|
||||
# via fastapi
|
||||
# via
|
||||
# fastapi
|
||||
# typer
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anyio==4.13.0
|
||||
@@ -64,6 +66,7 @@ click==8.3.1
|
||||
# jiwer
|
||||
# nltk
|
||||
# schemathesis
|
||||
# typer
|
||||
# uvicorn
|
||||
colorama==0.4.6
|
||||
# via sacrebleu
|
||||
@@ -112,7 +115,6 @@ filelock==3.25.2
|
||||
# huggingface-hub
|
||||
# modelscope
|
||||
# torch
|
||||
# transformers
|
||||
frozenlist==1.8.0
|
||||
# via
|
||||
# aiohttp
|
||||
@@ -133,9 +135,7 @@ h11==0.16.0
|
||||
# uvicorn
|
||||
harfile==0.4.0
|
||||
# via schemathesis
|
||||
hf-transfer==0.1.9
|
||||
# via -r requirements/test/xpu.in
|
||||
hf-xet==1.4.2
|
||||
hf-xet==1.4.3
|
||||
# via huggingface-hub
|
||||
html2text==2025.4.15
|
||||
# via gpt-oss
|
||||
@@ -144,8 +144,9 @@ httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
# via
|
||||
# datasets
|
||||
# huggingface-hub
|
||||
# schemathesis
|
||||
huggingface-hub==0.36.2
|
||||
huggingface-hub==1.10.2
|
||||
# via
|
||||
# accelerate
|
||||
# datasets
|
||||
@@ -515,7 +516,6 @@ requests==2.33.1
|
||||
# docker
|
||||
# evaluate
|
||||
# gpt-oss
|
||||
# huggingface-hub
|
||||
# lm-eval
|
||||
# mistral-common
|
||||
# modelscope
|
||||
@@ -524,11 +524,11 @@ requests==2.33.1
|
||||
# schemathesis
|
||||
# starlette-testclient
|
||||
# tiktoken
|
||||
# transformers
|
||||
rich==14.3.3
|
||||
# via
|
||||
# mteb
|
||||
# schemathesis
|
||||
# typer
|
||||
rouge-score==0.1.2
|
||||
# via lm-eval
|
||||
rpds-py==0.30.0
|
||||
@@ -572,6 +572,8 @@ setuptools==80.10.2
|
||||
# modelscope
|
||||
# pytablewriter
|
||||
# torch
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
@@ -665,7 +667,7 @@ tqdm==4.67.3
|
||||
# pqdm
|
||||
# sentence-transformers
|
||||
# transformers
|
||||
transformers==4.57.6
|
||||
transformers==5.5.3
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
# sentence-transformers
|
||||
@@ -676,6 +678,10 @@ typepy==1.3.4
|
||||
# dataproperty
|
||||
# pytablewriter
|
||||
# tabledata
|
||||
typer==0.24.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
typing-extensions==4.15.0
|
||||
# via
|
||||
# -c requirements/common.txt
|
||||
|
||||
@@ -410,6 +410,15 @@ class HfRunner:
|
||||
model_name,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
# HF runner should use the HF config so that it's consistent with the HF model
|
||||
if self.config.__module__.startswith("vllm.transformers_utils.configs"):
|
||||
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
|
||||
|
||||
del CONFIG_MAPPING._extra_content[self.config.model_type]
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
self.device = self.get_default_device()
|
||||
self.dtype = dtype = _get_and_verify_dtype(
|
||||
self.model_name,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from importlib import reload
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
@@ -47,6 +48,11 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
|
||||
def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
|
||||
if current_platform.is_cuda():
|
||||
monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
|
||||
import vllm.lora.layers.base_linear
|
||||
|
||||
if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
|
||||
# Reload the module to ensure the environment variable takes effect.
|
||||
reload(vllm.lora.layers.base_linear)
|
||||
yield
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from importlib.metadata import version
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
|
||||
import vllm
|
||||
from vllm.assets.image import ImageAsset
|
||||
@@ -10,6 +13,14 @@ from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
Version("5.0") <= Version(version("transformers")),
|
||||
reason=(
|
||||
"MiniCPMV custom processor uses tokenizer.im_start_id which is not "
|
||||
"available on TokenizersBackend in transformers v5.0+"
|
||||
),
|
||||
)
|
||||
|
||||
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import huggingface_hub.constants
|
||||
@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_weights_from_hf,
|
||||
enable_hf_transfer,
|
||||
maybe_remap_kv_scale_name,
|
||||
)
|
||||
|
||||
|
||||
def test_hf_transfer_auto_activation():
|
||||
if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
|
||||
# in case it is already set, we can't test the auto activation
|
||||
pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
|
||||
enable_hf_transfer()
|
||||
try:
|
||||
# enable hf hub transfer if available
|
||||
import hf_transfer # type: ignore # noqa
|
||||
|
||||
HF_TRANSFER_ACTIVE = True
|
||||
except ImportError:
|
||||
HF_TRANSFER_ACTIVE = False
|
||||
assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
|
||||
|
||||
|
||||
def test_download_weights_from_hf():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# assert LocalEntryNotFoundError error is thrown
|
||||
@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_hf_transfer_auto_activation()
|
||||
test_download_weights_from_hf()
|
||||
|
||||
@@ -143,6 +143,11 @@ def test_models(
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
|
||||
# This untrained model is sensitive to the rounding error
|
||||
# Fuse ops to reduce bfloat16 rounding
|
||||
monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
|
||||
@@ -69,7 +69,10 @@ MODELS = [
|
||||
attn_type="decoder",
|
||||
is_prefix_caching_supported=True,
|
||||
is_chunked_prefill_supported=True,
|
||||
enable_test=True,
|
||||
# Skip: model's custom tokenizer on HF hub is incompatible with
|
||||
# transformers v5 (sets attrs before super().__init__, triggering
|
||||
# AttributeError on 'verbose' in __getattr__).
|
||||
enable_test=False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -72,7 +72,8 @@ MODELS = [
|
||||
attn_type="encoder_only",
|
||||
is_prefix_caching_supported=False,
|
||||
is_chunked_prefill_supported=False,
|
||||
enable_test=True,
|
||||
# Skip: numerical regression with transformers v5.
|
||||
enable_test=False,
|
||||
),
|
||||
########## ModernBertModel
|
||||
EmbedModelInfo(
|
||||
|
||||
@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
mteb_test_rerank_models(vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
|
||||
"is incompatible with transformers v5 (missing all_tied_weights_keys)"
|
||||
)
|
||||
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("dimensions", [16, 32])
|
||||
|
||||
@@ -186,7 +186,14 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModel,
|
||||
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
# TODO: Remove skip once model has been upstreamed to Transformers
|
||||
pytest.mark.skip(
|
||||
reason="Custom model code is not compatible with Transformers v5"
|
||||
),
|
||||
],
|
||||
),
|
||||
#### Transformers fallback to test
|
||||
## To reduce test burden, we only test batching arbitrary image size
|
||||
@@ -397,14 +404,14 @@ VLM_TEST_SETTINGS = {
|
||||
"gemma4": VLMTestInfo(
|
||||
models=["google/gemma-4-E2B-it"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "What's the content in the center of the image?",
|
||||
"cherry_blossom": "What is the season?",
|
||||
"stop_sign": "<|image|>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<|image|>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="Describe the two images in detail.",
|
||||
multi_image_prompt="<|image|><|image|>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
@@ -533,6 +540,12 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=4096,
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
# TODO: Remove skip once model has been upstreamed to Transformers
|
||||
marks=[
|
||||
pytest.mark.skip(
|
||||
reason="Custom model code tries to access data from meta-tensor"
|
||||
)
|
||||
],
|
||||
),
|
||||
"intern_vl-video": VLMTestInfo(
|
||||
models=[
|
||||
@@ -545,6 +558,12 @@ VLM_TEST_SETTINGS = {
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
num_logprobs=10 if current_platform.is_rocm() else 5,
|
||||
# TODO: Remove skip once model has been upstreamed to Transformers
|
||||
marks=[
|
||||
pytest.mark.skip(
|
||||
reason="Custom model code tries to access data from meta-tensor"
|
||||
)
|
||||
],
|
||||
),
|
||||
"intern_vl-hf": VLMTestInfo(
|
||||
models=["OpenGVLab/InternVL3-1B-hf"],
|
||||
@@ -591,6 +610,8 @@ VLM_TEST_SETTINGS = {
|
||||
hf_model_kwargs={"device_map": "auto"},
|
||||
patch_hf_runner=model_utils.isaac_patch_hf_runner,
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
# TODO: Remove skip once model has been upstreamed to Transformers
|
||||
marks=[pytest.mark.skip(reason="Custom model imports deleted object")], # noqa: E501
|
||||
),
|
||||
"kimi_vl": VLMTestInfo(
|
||||
models=["moonshotai/Kimi-VL-A3B-Instruct"],
|
||||
@@ -806,7 +827,12 @@ VLM_TEST_SETTINGS = {
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
|
||||
reason="This model is broken in Transformers v4.57.3",
|
||||
)
|
||||
),
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
|
||||
reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
|
||||
"['default'] which was removed in transformers v5",
|
||||
),
|
||||
],
|
||||
),
|
||||
"phi3v": VLMTestInfo(
|
||||
@@ -960,6 +986,12 @@ VLM_TEST_SETTINGS = {
|
||||
)
|
||||
for inp in custom_inputs.different_patch_input_cases_internvl()
|
||||
],
|
||||
# TODO: Remove skip once model has been upstreamed to Transformers
|
||||
marks=[
|
||||
pytest.mark.skip(
|
||||
reason="Custom model code tries to access data from meta-tensor"
|
||||
)
|
||||
],
|
||||
),
|
||||
"llava_onevision-multiple-images": VLMTestInfo(
|
||||
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
|
||||
|
||||
@@ -103,6 +103,10 @@ def run_test(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="Model's custom MBart decoder has head count mismatch with "
|
||||
"transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
|
||||
)
|
||||
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
|
||||
@@ -2,9 +2,11 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
from importlib.metadata import version
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
from packaging.version import Version
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
@@ -19,6 +21,15 @@ from ....conftest import (
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
Version("5.0") <= Version(version("transformers")),
|
||||
reason=(
|
||||
"vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
|
||||
"internals (filter_out_non_signature_kwargs) removed by "
|
||||
"huggingface/transformers#43514"
|
||||
),
|
||||
)
|
||||
|
||||
MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
|
||||
@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
|
||||
"doesn't resolve chat_template=None to the default template"
|
||||
)
|
||||
def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
|
||||
"""Compare vLLM Mistral-format output against HF Transformers reference.
|
||||
|
||||
|
||||
@@ -80,6 +80,11 @@ def run_test(
|
||||
if vllm_runner_kwargs:
|
||||
vllm_runner_kwargs_.update(vllm_runner_kwargs)
|
||||
|
||||
# Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
|
||||
# already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
|
||||
if "limit_mm_per_prompt" in vllm_runner_kwargs_:
|
||||
limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=max_model_len,
|
||||
|
||||
@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
|
||||
pytestmark = pytest.mark.skip(
|
||||
reason="ColQwen3 model's weight tying is incompatible with "
|
||||
"transformers v5 (missing all_tied_weights_keys)"
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
"TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"OpenSearch-AI/Ops-Colqwen3-4B",
|
||||
|
||||
@@ -12,6 +12,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ....conftest import ImageTestAssets
|
||||
|
||||
pytestmark = pytest.mark.skip(
|
||||
reason="InternVisionModel's custom code is incompatible with "
|
||||
"transformers v5 (missing all_tied_weights_keys)"
|
||||
)
|
||||
|
||||
# we use snapshot_download to prevent conflicts between
|
||||
# dynamic_module and trust_remote_code for hf_runner
|
||||
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
|
||||
|
||||
@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.scoring.typing import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
|
||||
pytestmark = pytest.mark.skip(
|
||||
reason="jinaai/jina-reranker-m0 custom code is incompatible with "
|
||||
"transformers v5 (missing all_tied_weights_keys)"
|
||||
)
|
||||
|
||||
MODELS = ["jinaai/jina-reranker-m0"]
|
||||
|
||||
MM_PROCESSOR_KWARGS = {
|
||||
|
||||
@@ -17,11 +17,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from importlib.metadata import version
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
|
||||
assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
Version(version("transformers")) >= Version("5.5"),
|
||||
reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
|
||||
"with a different get_audio_features signature (requires input_ids)",
|
||||
)
|
||||
def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
|
||||
from transformers.models.musicflamingo import (
|
||||
modeling_musicflamingo as hf_musicflamingo_modeling,
|
||||
|
||||
+130
-9
@@ -335,7 +335,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"internlm/internlm2-chat-7b", trust_remote_code=True
|
||||
),
|
||||
"InternLM2VEForCausalLM": _HfExamplesInfo(
|
||||
"OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
|
||||
"OpenGVLab/Mono-InternVL-2B",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"Custom config cannot be loaded with Transformers "
|
||||
"v5 because `vision_config` is not always set"
|
||||
)
|
||||
},
|
||||
),
|
||||
"InternLM3ForCausalLM": _HfExamplesInfo(
|
||||
"internlm/internlm3-8b-instruct", trust_remote_code=True
|
||||
@@ -475,6 +483,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"Plamo2ForCausalLM": _HfExamplesInfo(
|
||||
"pfnet/plamo-2-1b",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"hf": (
|
||||
"Custom model code uses `_tied_weight_keys: list[str]` but "
|
||||
"Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
|
||||
)
|
||||
},
|
||||
),
|
||||
"Plamo3ForCausalLM": _HfExamplesInfo(
|
||||
"pfnet/plamo-3-nict-2b-base",
|
||||
@@ -515,6 +530,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
is_available_online=True,
|
||||
max_transformers_version="5.3",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"vllm upgraded transformers above v5.4 where "
|
||||
"validate_rope() no longer accepts ignore_keys param"
|
||||
)
|
||||
},
|
||||
),
|
||||
"SeedOssForCausalLM": _HfExamplesInfo(
|
||||
"ByteDance-Seed/Seed-OSS-36B-Instruct",
|
||||
@@ -553,6 +575,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"xverse/XVERSE-7B-Chat",
|
||||
tokenizer="meta-llama/Llama-2-7b",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": "XVERSE tokenizer is incompatible with transformers v5 "
|
||||
"(add_prefix_space / prepend_scheme mismatch).",
|
||||
},
|
||||
),
|
||||
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
|
||||
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
|
||||
@@ -763,10 +790,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
|
||||
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
|
||||
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
|
||||
"nvidia/audio-flamingo-3-hf",
|
||||
min_transformers_version="5.3.0",
|
||||
transformers_version_reason={
|
||||
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
|
||||
},
|
||||
),
|
||||
"MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
|
||||
"nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0"
|
||||
"nvidia/music-flamingo-2601-hf",
|
||||
min_transformers_version="5.3.0",
|
||||
transformers_version_reason={
|
||||
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
|
||||
},
|
||||
),
|
||||
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
|
||||
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
|
||||
@@ -821,12 +856,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
),
|
||||
"FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
|
||||
"allendou/FireRedASR2-LLM-vllm",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="5.1",
|
||||
transformers_version_reason={
|
||||
"vllm": "Incompatible with transformers v5.2+ "
|
||||
"(dict object has no attribute '__name__').",
|
||||
},
|
||||
),
|
||||
"FireRedLIDForConditionalGeneration": _HfExamplesInfo(
|
||||
"PatchyTisa/FireRedLID-vllm",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="5.1",
|
||||
transformers_version_reason={
|
||||
"vllm": "Incompatible with transformers v5.2+ "
|
||||
"(dict object has no attribute '__name__').",
|
||||
},
|
||||
),
|
||||
"FunASRForConditionalGeneration": _HfExamplesInfo(
|
||||
"allendou/Fun-ASR-Nano-2512-vllm",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="5.1",
|
||||
transformers_version_reason={
|
||||
"vllm": "Incompatible with transformers v5.2+ "
|
||||
"(dict object has no attribute '__name__').",
|
||||
},
|
||||
),
|
||||
"FunAudioChatForConditionalGeneration": _HfExamplesInfo(
|
||||
"funaudiochat", is_available_online=False
|
||||
@@ -868,6 +921,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"HCXVisionForCausalLM": _HfExamplesInfo(
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"Custom config cannot be loaded with Transformers "
|
||||
"v5 because `text_config` is not always set"
|
||||
)
|
||||
},
|
||||
),
|
||||
"HCXVisionV2ForCausalLM": _HfExamplesInfo(
|
||||
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
|
||||
@@ -887,7 +947,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
|
||||
),
|
||||
"InternS1ForConditionalGeneration": _HfExamplesInfo(
|
||||
"internlm/Intern-S1", trust_remote_code=True
|
||||
"internlm/Intern-S1",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": "Custom tokenizer code is not compatible with Transformers v5."
|
||||
},
|
||||
),
|
||||
"InternS1ProForConditionalGeneration": _HfExamplesInfo(
|
||||
"internlm/Intern-S1-Pro",
|
||||
@@ -976,7 +1041,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"MiDashengLMModel": _HfExamplesInfo(
|
||||
"mispeech/midashenglm-7b", trust_remote_code=True
|
||||
),
|
||||
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
|
||||
"MiniCPMO": _HfExamplesInfo(
|
||||
"openbmb/MiniCPM-o-2_6",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"hf": "Custom processor code is not compatible with Transformers v5."
|
||||
},
|
||||
),
|
||||
"MiniCPMV": _HfExamplesInfo(
|
||||
"openbmb/MiniCPM-Llama3-V-2_5",
|
||||
extras={
|
||||
@@ -984,6 +1056,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"4.0": "openbmb/MiniCPM-V-4",
|
||||
"4.5": "openbmb/MiniCPM-V-4_5",
|
||||
},
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"MiniCPMVBatchFeature is incompatible with its base class in "
|
||||
"Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
|
||||
)
|
||||
},
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
|
||||
@@ -1083,13 +1162,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"OpenCUAForConditionalGeneration": _HfExamplesInfo(
|
||||
"xlangai/OpenCUA-7B", trust_remote_code=True
|
||||
"xlangai/OpenCUA-7B",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": "Tokenizer cannot be initialised in Transformers v5."
|
||||
},
|
||||
),
|
||||
"OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
|
||||
"FreedomIntelligence/openPangu-VL-7B",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
|
||||
"making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
|
||||
)
|
||||
},
|
||||
),
|
||||
"Ovis": _HfExamplesInfo(
|
||||
"AIDC-AI/Ovis2-1B",
|
||||
@@ -1101,12 +1192,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
|
||||
},
|
||||
),
|
||||
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
|
||||
"Ovis2_5": _HfExamplesInfo(
|
||||
"AIDC-AI/Ovis2.5-2B",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": "Custom processor code is not compatible with Transformers v5."
|
||||
},
|
||||
),
|
||||
"Ovis2_6ForCausalLM": _HfExamplesInfo(
|
||||
"AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
|
||||
),
|
||||
"Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
|
||||
"AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
|
||||
"AIDC-AI/Ovis2.6-30B-A3B",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.57",
|
||||
transformers_version_reason={
|
||||
"vllm": "Custom processor code is not compatible with Transformers v5."
|
||||
},
|
||||
),
|
||||
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
|
||||
"PaddlePaddle/PaddleOCR-VL",
|
||||
@@ -1126,7 +1229,17 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
|
||||
),
|
||||
"Phi4ForCausalLMV": _HfExamplesInfo(
|
||||
"microsoft/Phi-4-reasoning-vision-15B", trust_remote_code=True
|
||||
"microsoft/Phi-4-reasoning-vision-15B",
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="5.3",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"vllm upgraded transformers above v5.4 where HF model "
|
||||
"custom code uses siglip2 internals "
|
||||
"(filter_out_non_signature_kwargs) removed "
|
||||
"by huggingface/transformers#43514"
|
||||
)
|
||||
},
|
||||
),
|
||||
"Phi4MMForCausalLM": _HfExamplesInfo(
|
||||
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
|
||||
@@ -1223,6 +1336,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"architectures": ["Tarsier2ForConditionalGeneration"],
|
||||
"model_type": "tarsier2",
|
||||
},
|
||||
max_transformers_version="5.3",
|
||||
transformers_version_reason={
|
||||
"vllm": (
|
||||
"Qwen2VLConfig was split into Qwen2VLConfig + "
|
||||
"Qwen2VLTextConfig in transformers v5, breaking "
|
||||
"attribute access (num_attention_heads, hidden_size, etc.)"
|
||||
)
|
||||
},
|
||||
),
|
||||
"VoxtralForConditionalGeneration": _HfExamplesInfo(
|
||||
"mistralai/Voxtral-Mini-3B-2507",
|
||||
|
||||
+10
-1
@@ -476,7 +476,16 @@ def dummy_hf_overrides(
|
||||
else:
|
||||
# Use minimal layers for testing
|
||||
num_layers = 1
|
||||
num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
|
||||
num_hidden_layers = (
|
||||
3
|
||||
if model_arch
|
||||
in (
|
||||
"Gemma3nForConditionalGeneration",
|
||||
"Gemma4ForCausalLM",
|
||||
"Gemma4ForConditionalGeneration",
|
||||
)
|
||||
else 1
|
||||
)
|
||||
|
||||
update_dict = {
|
||||
"num_layers": num_layers,
|
||||
|
||||
@@ -2,10 +2,10 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
parser_name = "step3p5"
|
||||
start_token = "<think>"
|
||||
@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def step3p5_tokenizer():
|
||||
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
|
||||
return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)
|
||||
|
||||
|
||||
SIMPLE_REASONING = {
|
||||
|
||||
@@ -557,12 +557,16 @@ def test_eagle_correctness_light(
|
||||
"auto",
|
||||
0.8,
|
||||
),
|
||||
(
|
||||
pytest.param(
|
||||
("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
|
||||
False,
|
||||
False,
|
||||
"transformers",
|
||||
0.8,
|
||||
# TODO(hmellor): figure out why memory usage is so high
|
||||
marks=pytest.mark.skip(
|
||||
reason="Feature is experimental and uses too much memory in CI",
|
||||
),
|
||||
),
|
||||
pytest.param(
|
||||
(
|
||||
|
||||
@@ -265,12 +265,24 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
|
||||
or None if no mapping found
|
||||
"""
|
||||
# In transformers v5, multimodal models (e.g. Gemma3) wrap
|
||||
# all sub-models under an outer 'model.' attribute, producing
|
||||
# state_dict keys like 'model.language_model.layers.0...' and
|
||||
# 'model.vision_tower.vision_model...'. Strip this outer
|
||||
# prefix so the keys match what gguf-py expects.
|
||||
if is_multimodal and hf_name.startswith("model."):
|
||||
hf_name = hf_name[6:] # Remove outer 'model.'
|
||||
|
||||
# Strip 'language_model.' prefix for multimodal models - gguf-py
|
||||
# tensor mappings expect parameter names without this prefix.
|
||||
# Note: 'model.' prefix should be KEPT for text-only models as
|
||||
# gguf-py expects it.
|
||||
if hf_name.startswith("language_model."):
|
||||
hf_name = hf_name[15:] # Remove 'language_model.'
|
||||
# Re-add 'model.' prefix because gguf-py text tensor maps
|
||||
# expect 'model.layers...' format.
|
||||
if is_multimodal:
|
||||
hf_name = "model." + hf_name
|
||||
|
||||
# Parse parameter name and suffix
|
||||
if hf_name.endswith((".weight", ".bias")):
|
||||
|
||||
@@ -125,8 +125,12 @@ class Gemma4AudioInputs(TensorSchema):
|
||||
"""
|
||||
|
||||
type: Literal["audio"] = "audio"
|
||||
input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
|
||||
input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
|
||||
input_features_padded: Annotated[
|
||||
torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
|
||||
]
|
||||
input_features_mask: Annotated[
|
||||
torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
|
||||
]
|
||||
|
||||
|
||||
Gemma4ImageInputs = Gemma4ImagePixelInputs
|
||||
@@ -510,6 +514,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
|
||||
video_timestamps_per_video: list[list[float]] = []
|
||||
video_frame_counts: list[int] = []
|
||||
|
||||
video_replacements: list[str] = []
|
||||
|
||||
for item in videos:
|
||||
video_array, metadata = item
|
||||
|
||||
@@ -562,10 +568,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
|
||||
video_timestamps_per_video.append(timestamps)
|
||||
video_frame_counts.append(len(frames))
|
||||
|
||||
# Build expanded replacement text and replace the
|
||||
# <|video|> placeholder in the prompt.
|
||||
# Use split(token, 1) to avoid collision — the
|
||||
# replacement text itself contains <|video|> tokens.
|
||||
# Build expanded replacement text for this video.
|
||||
ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
|
||||
replacement = " ".join(
|
||||
f"{t} {processor.boi_token}"
|
||||
@@ -573,9 +576,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
|
||||
f"{processor.eoi_token}"
|
||||
for t, n in zip(ts_strs, num_soft_per_frame)
|
||||
)
|
||||
parts = prompt.split(processor.video_token, 1)
|
||||
if len(parts) == 2:
|
||||
prompt = parts[0] + replacement + parts[1]
|
||||
video_replacements.append(replacement)
|
||||
|
||||
# Replace all <|video|> placeholders at once. We split on
|
||||
# video_token to get N+1 parts, then interleave with the
|
||||
# N replacement strings. This avoids the iterative
|
||||
# split-replace bug where replacement text (which itself
|
||||
# contains <|video|> tokens) collides with later splits.
|
||||
vt = processor.video_token
|
||||
parts = prompt.split(vt, len(video_replacements))
|
||||
|
||||
# NOTE: len(parts) <= len(video_replacements) + 1
|
||||
parts_with_repl: list[str] = []
|
||||
for part, repl in zip(parts, video_replacements):
|
||||
parts_with_repl.extend([part, repl])
|
||||
parts_with_repl.extend(parts[len(video_replacements) :])
|
||||
|
||||
prompt = "".join(parts_with_repl)
|
||||
|
||||
video_outputs = {
|
||||
"pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
|
||||
@@ -638,19 +655,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
|
||||
)
|
||||
|
||||
if "input_features" in processed_outputs:
|
||||
# Keep padded features for batched audio tower execution.
|
||||
processed_outputs["input_features_padded"] = processed_outputs[
|
||||
"input_features"
|
||||
]
|
||||
# Unpad per-item so each item's cache entry is self-contained.
|
||||
# Unpad per-item so each item's cache entry is
|
||||
# self-contained. The batched() field config in
|
||||
# _get_mm_fields_config will re-pad all fields to the
|
||||
# batch's max length at batch time, ensuring consistent
|
||||
# padding regardless of cache history.
|
||||
masks = processed_outputs["input_features_mask"]
|
||||
unpadded_features = [
|
||||
f[mask]
|
||||
for f, mask in zip(
|
||||
processed_outputs["input_features"],
|
||||
processed_outputs["input_features_mask"],
|
||||
masks,
|
||||
)
|
||||
]
|
||||
unpadded_masks = [mask[mask] for mask in masks]
|
||||
processed_outputs["input_features"] = unpadded_features
|
||||
processed_outputs["input_features_padded"] = unpadded_features
|
||||
processed_outputs["input_features_mask"] = unpadded_masks
|
||||
|
||||
# Merge video outputs into the final result
|
||||
combined_outputs = dict(processed_outputs, **video_outputs)
|
||||
|
||||
@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import (
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.inputs import MultiModalDataDict
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
|
||||
@@ -275,6 +275,11 @@ class Base(
|
||||
)
|
||||
class SupportTorchCompileWrapper(cls): ...
|
||||
|
||||
# Preserve __module__ so transformers v5's source-file checks
|
||||
# (e.g. _can_set_experts_implementation) read the original
|
||||
# model's module instead of this file.
|
||||
SupportTorchCompileWrapper.__module__ = cls.__module__
|
||||
|
||||
# Patch the class in its module
|
||||
module = sys.modules[cls.__module__]
|
||||
setattr(module, cls.__name__, SupportTorchCompileWrapper)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
from dataclasses import dataclass, field
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
@@ -10,6 +11,7 @@ from typing_extensions import TypeVar, assert_never
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.transformers_utils.gguf_utils import (
|
||||
check_gguf_file,
|
||||
get_gguf_file_path_from_hf,
|
||||
@@ -31,6 +33,13 @@ if TYPE_CHECKING:
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
# Model types whose hub tokenizer_class is incorrect and should be overridden with
|
||||
# TokenizersBackend (the generic fast tokenizer). Adding a model type here is always a
|
||||
# temporary workaround and better long term solutions are:
|
||||
# - Add model type to MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS in transformers (better)
|
||||
# - Fix tokenizer_class on the hub for the affected models (best)
|
||||
_MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS: set[str] = {"step3_vl"}
|
||||
|
||||
_VLLM_TOKENIZERS = {
|
||||
"deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
|
||||
"grok2": ("grok2", "Grok2Tokenizer"),
|
||||
@@ -202,7 +211,31 @@ def get_tokenizer(
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if tokenizer_cls == TokenizerLike:
|
||||
# Ensure that, if the config were to come from vllm.transformers_utils.config, it is
|
||||
# registered with AutoConfig before the tokenizer is loaded. This is necessary since
|
||||
# tokenizer_cls_.from_pretrained will call AutoConfig.from_pretrained internally.
|
||||
# This may fail for paths that don't have a model config (e.g. LoRA adapters),
|
||||
# which is fine — those don't need custom config registration.
|
||||
config = None
|
||||
with contextlib.suppress(ValueError, OSError):
|
||||
config = get_config(
|
||||
tokenizer_name,
|
||||
trust_remote_code=trust_remote_code,
|
||||
revision=revision,
|
||||
)
|
||||
|
||||
# Some models have an incorrect tokenizer_class on the hub.
|
||||
# For these model types, bypass AutoTokenizer and use TokenizersBackend directly.
|
||||
model_type = getattr(config, "model_type", None) if config else None
|
||||
if model_type in _MODEL_TYPES_WITH_INCORRECT_TOKENIZER_CLASS:
|
||||
from transformers.tokenization_utils_tokenizers import TokenizersBackend
|
||||
|
||||
logger.debug(
|
||||
"Overriding tokenizer_class to TokenizersBackend for model_type=%r",
|
||||
model_type,
|
||||
)
|
||||
tokenizer_cls_ = TokenizersBackend
|
||||
elif tokenizer_cls == TokenizerLike:
|
||||
tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
|
||||
else:
|
||||
tokenizer_cls_ = tokenizer_cls
|
||||
|
||||
Reference in New Issue
Block a user