[Mypy] Better fixes for the mypy issues in vllm/config (#37902)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2026-03-25 13:14:43 +00:00
committed by GitHub
parent 34d317dcec
commit d215d1efca
35 changed files with 153 additions and 182 deletions
@@ -42,7 +42,6 @@ details.
import random
import time
from dataclasses import fields
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
@@ -124,7 +123,7 @@ def main(args):
# Create the LLM engine
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
+1 -2
View File
@@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from dataclasses import fields
from transformers import PreTrainedTokenizerBase
@@ -197,7 +196,7 @@ def main(args):
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0,
+1 -2
View File
@@ -6,7 +6,6 @@ import argparse
import json
import random
import time
from dataclasses import fields
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -79,7 +78,7 @@ def run_vllm(
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
+1 -2
View File
@@ -9,7 +9,6 @@ on HuggingFace model repository.
"""
import os
from dataclasses import asdict
from typing import Any, NamedTuple
from huggingface_hub import snapshot_download
@@ -633,7 +632,7 @@ def main(args):
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
engine_args = vars(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
import os
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
@@ -91,13 +90,12 @@ def main(args):
req_data = model_example_map[model]()
# Disable other modalities to save memory
engine_args = req_data.engine_args
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
engine_args.limit_mm_per_prompt = limit_mm_per_prompt
engine_args.seed = args.seed
llm = LLM.from_engine_args(engine_args)
prompts = req_data.prompts
@@ -20,8 +20,6 @@ python load_sharded_state.py \
--max-tokens 50
"""
import dataclasses
from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -64,7 +62,7 @@ def main():
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
# Load the model using engine args
llm = LLM(**dataclasses.asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
# Prepare sampling parameters
sampling_params = SamplingParams(
@@ -21,7 +21,6 @@ llm = LLM(
)
"""
import dataclasses
import os
import shutil
from pathlib import Path
@@ -60,7 +59,7 @@ def main(args):
if not Path(model_path).is_dir():
raise ValueError("model path must be a local directory")
# Create LLM instance from arguments
llm = LLM(**dataclasses.asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
# Prepare output directory
Path(args.output).mkdir(exist_ok=True)
# Dump worker states to output directory
@@ -11,7 +11,6 @@ on HuggingFace model repository.
import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple
from huggingface_hub import snapshot_download
@@ -2434,13 +2433,13 @@ def main(args):
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {
"seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
}
engine_args = req_data.engine_args
engine_args.seed = args.seed
mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
engine_args.mm_processor_cache_gb = mm_processor_cache_gb
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
engine_args.tensor_parallel_size = args.tensor_parallel_size
llm = LLM.from_engine_args(engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = (
@@ -8,7 +8,6 @@ using the chat template defined by the model.
import os
from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple
from huggingface_hub import snapshot_download
@@ -1481,10 +1480,11 @@ def run_generate(
):
req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
engine_args = req_data.engine_args
engine_args.seed = seed
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
engine_args.tensor_parallel_size = tensor_parallel_size
llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
@@ -1521,10 +1521,11 @@ def run_chat(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
engine_args = req_data.engine_args
engine_args.seed = seed
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
engine_args.tensor_parallel_size = tensor_parallel_size
llm = LLM.from_engine_args(engine_args)
sampling_params = (
SamplingParams(
@@ -10,12 +10,11 @@ on HuggingFace model repository.
"""
import argparse
from dataclasses import asdict
from pathlib import Path
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm import LLM
from vllm.multimodal.utils import fetch_image
from vllm.utils.print_utils import print_embeddings
@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}
def run_clip(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
@@ -53,15 +51,14 @@ def run_clip(seed: int):
def run_e5_v(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
print("Text embedding output:")
@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):
multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
)
default_instruction = "Represent the user's input."
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
llm = LLM(**asdict(engine_args) | {"seed": seed})
llm = LLM(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
seed=seed,
)
print("Text embedding output:")
outputs = llm.embed(prompt_text, use_tqdm=False)
@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):
def run_siglip(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
@@ -174,16 +170,15 @@ def run_siglip(seed: int):
def run_vlm2vec_phi3v(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_1|>"
print("Text embedding output:")
@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
processor.save_pretrained(merged_path)
print("Done!")
engine_args = EngineArgs(
llm = LLM(
model=merged_path,
runner="pooling",
max_model_len=4096,
@@ -268,9 +263,8 @@ def run_vlm2vec_qwen2vl(seed: int):
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_pad|>"
print("Text embedding output:")
@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).
from argparse import Namespace
from collections.abc import Callable
from dataclasses import asdict
from pathlib import Path
from typing import NamedTuple
@@ -125,7 +124,7 @@ def main(args: Namespace):
model_request = model_example_map[args.model_name]()
engine_args = model_request.engine_args
llm = LLM(**asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
print("Query: string & Document: string")
outputs = llm.score(query, document)
+6 -2
View File
@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
ctx,
patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
):
kwargs = {}
if cudagraph_capture_sizes is not None:
kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
if max_cudagraph_capture_size is not None:
kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
compilation_config = CompilationConfig(
cudagraph_capture_sizes=cudagraph_capture_sizes,
max_cudagraph_capture_size=max_cudagraph_capture_size,
pass_config=PassConfig(
enable_sp=enable_sp,
fuse_norm_quant=True,
@@ -425,6 +428,7 @@ def test_cudagraph_sizes_post_init(
sp_min_token_num=512 if enable_sp else None,
),
cudagraph_mode=cudagraph_mode,
**kwargs,
)
engine_args = EngineArgs(
model="facebook/opt-125m",
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""
import dataclasses
import importlib
import sys
@@ -12,7 +11,6 @@ import urllib3
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EngineArgs
MODEL_CONFIGS = [
{
@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
# Need to re-import huggingface_hub
# and friends to set up offline mode
_re_import_modules()
engine_args = EngineArgs(model="facebook/opt-125m")
LLM(**dataclasses.asdict(engine_args))
LLM(model="facebook/opt-125m")
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
images = [asset.pil_image for asset in image_assets]
image_urls = [encode_image_url(image) for image in images]
engine_args = EngineArgs(
model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
@@ -54,8 +45,14 @@ def test_keye_vl(image_assets, question: str):
messages, tokenize=False, add_generation_prompt=True
)
engine_args = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_args)
llm = LLM(
model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
seed=42,
)
sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=None
@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""
from dataclasses import asdict
from typing import Any
import pytest
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
# Create engine
engine_args = EngineArgs(
llm = LLM(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
@@ -283,11 +282,9 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
seed=42,
)
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.generate(
@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
messages = build_dots_ocr_prompt([stop_sign_image], config)
# Create engine
engine_args = EngineArgs(
llm = LLM(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
@@ -327,11 +324,9 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
seed=42,
)
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate using chat
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
from dataclasses import asdict
import pytest
import pytest_asyncio
@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
@pytest.fixture
def engine():
engine_args = EngineArgs(**ENGINE_CONFIG)
llm = LLM(**asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
try:
yield llm
finally:
@@ -1,12 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import AttentionConfig, KVTransferConfig
from vllm.multimodal.utils import encode_image_url
@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
# Using tmp_path as the storage path to store KV
print(f"KV storage path at: {str(tmp_path)}")
# Configure the ExampleConnector
kv_transfer_config = KVTransferConfig(
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
)
engine_args = EngineArgs(
model=MODEL_NAME,
max_model_len=8192,
max_num_seqs=1,
gpu_memory_utilization=0.4,
attention_config=AttentionConfig(backend=attn_backend),
enforce_eager=True,
kv_transfer_config=kv_transfer_config,
limit_mm_per_prompt={"image": 2},
)
# don't put this import at the top level
# it will call torch.accelerator.device_count()
from transformers import AutoProcessor
@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
assert image_1 != image_2, "The images should not be identical"
# Create the LLM instance
engine_args = asdict(engine_args)
llm = LLM(**engine_args)
llm = LLM(
model=MODEL_NAME,
max_model_len=8192,
max_num_seqs=1,
gpu_memory_utilization=0.4,
attention_config=AttentionConfig(backend=attn_backend),
enforce_eager=True,
kv_transfer_config=KVTransferConfig(
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
),
limit_mm_per_prompt={"image": 2},
)
# Prepare the input cases
input_cases = [
+1 -2
View File
@@ -6,7 +6,6 @@ import argparse
import json
import os
import time
from dataclasses import fields
from typing import Any
import numpy as np
@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len + args.output_len
), (
+1 -2
View File
@@ -17,7 +17,6 @@ import argparse
import json
import time
from collections import defaultdict
from dataclasses import fields
from datetime import datetime
from typing import TYPE_CHECKING, Any, Literal
@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
args.seed = 0
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
tokenizer = llm.get_tokenizer()
requests = get_requests(args, tokenizer)
+1 -2
View File
@@ -16,7 +16,6 @@ import shutil
import tempfile
import time
from contextlib import contextmanager
from dataclasses import fields
from typing import Any
import numpy as np
@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
# Measure total startup time
start_time = time.perf_counter()
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
total_startup_time = time.perf_counter() - start_time
+2 -3
View File
@@ -8,7 +8,6 @@ import os
import random
import time
import warnings
from dataclasses import fields
from typing import Any
import torch
@@ -53,7 +52,7 @@ def run_vllm(
) -> tuple[float, list[RequestOutput] | None]:
from vllm import LLM, SamplingParams
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
assert all(
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
@@ -141,7 +140,7 @@ def run_vllm_chat(
"""
from vllm import LLM, SamplingParams
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
assert all(
llm.llm_engine.model_config.max_model_len
+15 -20
View File
@@ -116,29 +116,29 @@ class PassConfig:
"""
# New flags
fuse_norm_quant: bool | None = Field(default=None)
fuse_norm_quant: bool = None # type: ignore[assignment]
"""Fuse the custom RMSNorm + quant ops."""
fuse_act_quant: bool | None = Field(default=None)
fuse_act_quant: bool = None # type: ignore[assignment]
"""Fuse the custom SiluMul + quant ops."""
fuse_attn_quant: bool | None = Field(default=None)
fuse_attn_quant: bool = None # type: ignore[assignment]
"""Fuse the custom attention + quant ops."""
eliminate_noops: bool = Field(default=True)
"""Eliminate no-op ops."""
enable_sp: bool | None = Field(default=None)
enable_sp: bool = None # type: ignore[assignment]
"""Enable sequence parallelism. Requires TP>1. Automatically disabled
if the model's hidden_size is too small for SP to be beneficial
(threshold is device-capability dependent)."""
fuse_gemm_comms: bool | None = Field(default=None)
fuse_gemm_comms: bool = None # type: ignore[assignment]
"""Enable async TP."""
fuse_allreduce_rms: bool | None = Field(default=None)
fuse_allreduce_rms: bool = None # type: ignore[assignment]
"""Enable flashinfer allreduce fusion."""
enable_qk_norm_rope_fusion: bool = False
"""Enable fused Q/K RMSNorm + RoPE pass."""
# ROCm/AITER specific fusions
fuse_act_padding: bool | None = Field(default=None)
fuse_act_padding: bool = None # type: ignore[assignment]
"""Fuse the custom RMSNorm + padding ops."""
fuse_rope_kvcache: bool | None = Field(default=None)
fuse_rope_kvcache: bool = None # type: ignore[assignment]
"""Fuse the QK rope + KV cache ops."""
rope_kvcache_fusion_max_token_num: int = 256
@@ -405,7 +405,7 @@ class CompilationConfig:
"""
# Top-level Compilation control
mode: CompilationMode = Field(default=None) # type: ignore[assignment]
mode: CompilationMode = None # type: ignore[assignment]
"""The compilation approach used for torch.compile-based compilation of the
model.
@@ -545,7 +545,7 @@ class CompilationConfig:
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
# CudaGraph compilation
cudagraph_mode: CUDAGraphMode = Field(default=None) # type: ignore[assignment]
cudagraph_mode: CUDAGraphMode = None # type: ignore[assignment]
"""
The mode of the cudagraph:
@@ -586,7 +586,7 @@ class CompilationConfig:
It means the first several runs will be treated as warmup runs.
Only after that, the execution will be recorded, and the recorded
cudagraph will be used for subsequent runs."""
cudagraph_capture_sizes: list[int] | None = None
cudagraph_capture_sizes: list[int] = None # type: ignore[assignment]
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from vllm config.
- list[int]: capture sizes are specified as given."""
@@ -607,7 +607,7 @@ class CompilationConfig:
When `enable_lora` is False, this option has no effect.
"""
use_inductor_graph_partition: bool = Field(default=None) # type: ignore[assignment]
use_inductor_graph_partition: bool = None # type: ignore[assignment]
"""Use inductor graph partition to split the graph at cudagraph_unsafe ops.
This partition happens at inductor codegen time after all passes and fusions
are finished. It generates a single `call` function which wraps
@@ -630,7 +630,7 @@ class CompilationConfig:
pass_config: PassConfig = field(default_factory=PassConfig)
"""Custom inductor passes, see PassConfig for more details"""
max_cudagraph_capture_size: int | None = field(default=None)
max_cudagraph_capture_size: int = None # type: ignore[assignment]
"""The maximum cudagraph capture size.
If cudagraph_capture_sizes is specified, this will be set to the largest
@@ -750,7 +750,7 @@ class CompilationConfig:
return hash_factors(factors)
def __repr__(self) -> str:
exclude = {
exclude: dict[str, bool | dict[str, bool]] = {
"static_forward_context": True,
"enabled_custom_ops": True,
"disabled_custom_ops": True,
@@ -770,9 +770,7 @@ class CompilationConfig:
exclude["pass_config"] = pass_config_exclude
config = TypeAdapter(CompilationConfig).dump_python(
self,
exclude=exclude, # type: ignore[arg-type]
exclude_unset=True,
self, exclude=exclude, exclude_unset=True
)
return str(config)
@@ -1023,7 +1021,6 @@ class CompilationConfig:
"Unrecognized size type in compile_sizes, "
f"expect 'cudagraph_capture_sizes', got {x}"
)
assert self.cudagraph_capture_sizes is not None
computed_compile_sizes.extend(self.cudagraph_capture_sizes)
else:
assert isinstance(x, int)
@@ -1031,7 +1028,6 @@ class CompilationConfig:
self.compile_sizes = computed_compile_sizes # type: ignore
# make sure the sizes are in ascending order
assert self.cudagraph_capture_sizes is not None
self.cudagraph_capture_sizes.sort()
if self.cudagraph_capture_sizes:
assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
@@ -1123,7 +1119,6 @@ class CompilationConfig:
def set_splitting_ops_for_attn_fusion(self):
assert self.pass_config.fuse_attn_quant
assert self.cudagraph_mode is not None
if self.splitting_ops is None:
self.splitting_ops = []
if self.cudagraph_mode.has_piecewise_cudagraphs():
+2 -2
View File
@@ -13,8 +13,8 @@ from vllm.utils.hashing import safe_hash
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
class DeviceConfig: # type: ignore[misc]
@config(config=ConfigDict(arbitrary_types_allowed=True))
class DeviceConfig:
"""Configuration for the device to use for vLLM execution."""
device: SkipValidation[Device | torch.device | None] = "auto"
+2 -2
View File
@@ -4,7 +4,7 @@
from collections.abc import Callable
from typing import Any, Literal
from pydantic import Field, field_validator
from pydantic import field_validator
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
@@ -26,7 +26,7 @@ MoEBackend = Literal[
class KernelConfig:
"""Configuration for kernel selection and warmup behavior."""
enable_flashinfer_autotune: bool | None = Field(default=None)
enable_flashinfer_autotune: bool = None # type: ignore[assignment]
"""If True, run FlashInfer autotuning during kernel warmup."""
moe_backend: MoEBackend = "auto"
+1 -3
View File
@@ -4,8 +4,6 @@
from typing import Literal
from pydantic import Field
from vllm.config.utils import config
@@ -18,7 +16,7 @@ class KVEventsConfig:
Events can be published externally by zmq using the event publisher config.
"""
publisher: Literal["null", "zmq"] | None = Field(default=None)
publisher: Literal["null", "zmq"] = None # type: ignore[assignment]
"""The publisher to use for publishing kv events. Can be "null", "zmq".
"""
+2 -2
View File
@@ -25,8 +25,8 @@ MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
LoRAExtraVocabSize = Literal[256, 512]
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
class LoRAConfig: # type: ignore[misc]
@config(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
"""Configuration for LoRA."""
max_lora_rank: MaxLoRARanks = 16
+6 -6
View File
@@ -102,8 +102,8 @@ AttnTypeStr = Literal[
]
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
class ModelConfig: # type: ignore[misc]
@config(config=ConfigDict(arbitrary_types_allowed=True))
class ModelConfig:
"""Configuration for the model."""
model: str = "Qwen/Qwen3-0.6B"
@@ -121,7 +121,7 @@ class ModelConfig: # type: ignore[misc]
"""Convert the model using adapters defined in
[vllm.model_executor.models.adapters][]. The most common use case is to
adapt a text generation model to be used for pooling tasks."""
tokenizer: str = Field(default=None) # type: ignore[assignment]
tokenizer: str = None # type: ignore[assignment]
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
@@ -583,7 +583,7 @@ class ModelConfig: # type: ignore[misc]
self.dtype,
is_pooling_model=self.runner_type == "pooling",
revision=self.revision,
config_format=self.config_format, # type: ignore[arg-type]
config_format=self.config_format,
)
self.original_max_model_len = self.max_model_len
@@ -733,7 +733,7 @@ class ModelConfig: # type: ignore[misc]
@property
def architectures(self) -> list[str]:
return self.model_arch_config.architectures # type: ignore[return-value]
return self.model_arch_config.architectures
@property
def architecture(self) -> str:
@@ -1944,7 +1944,7 @@ def _get_and_verify_dtype(
*,
is_pooling_model: bool,
revision: str | None = None,
config_format: ConfigFormat = "hf",
config_format: str | ConfigFormat = "hf",
) -> torch.dtype:
config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
config, model_id, revision=revision, config_format=config_format
+1 -1
View File
@@ -16,7 +16,7 @@ class ModelArchitectureConfig:
Configuration for model architecture that required by vLLM runtime
"""
architectures: list[str] | None
architectures: list[str]
"""List of model architecture class names (e.g., ['LlamaForCausalLM']).
It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
+1 -1
View File
@@ -194,7 +194,7 @@ class ParallelConfig:
threshold, microbatching will be used. Otherwise, the request will be
processed in a single batch."""
disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
disable_nccl_for_dp_synchronization: bool | None = None
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
to use Gloo instead of NCCL for its all reduce.
+3 -3
View File
@@ -52,7 +52,7 @@ class SchedulerConfig:
In real usage, this should be set in `EngineArgs.create_engine_config`.
"""
max_num_scheduled_tokens: int | None = Field(default=None)
max_num_scheduled_tokens: int | None = None
"""Maximum number of tokens that the scheduler may issue in a single iteration.
This is usually equal to max_num_batched_tokens, but can be smaller in cases
@@ -122,7 +122,7 @@ class SchedulerConfig:
# scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
# (default) or "mod.custom_class".
scheduler_cls: str | type[object] | None = Field(default=None)
scheduler_cls: str | type[object] | None = None
"""The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
the default scheduler. Can be a class directly or the path to a class of
form "mod.custom_class"."""
@@ -141,7 +141,7 @@ class SchedulerConfig:
checking the first chunk. Prevents over-admission and KV cache thrashing
with chunked prefill."""
async_scheduling: bool | None = Field(default=None)
async_scheduling: bool | None = None
"""If set to False, disable async scheduling. Async scheduling helps to
avoid gaps in GPU utilization, leading to better latency and throughput.
"""
+14 -4
View File
@@ -11,13 +11,13 @@ import os
import pathlib
import textwrap
from collections.abc import Callable, Mapping, Sequence, Set
from dataclasses import MISSING, dataclass, field, fields, is_dataclass
from dataclasses import MISSING, field, fields, is_dataclass
from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast, overload
import torch
from pydantic import ConfigDict
from pydantic.dataclasses import dataclass as pydantic_dataclass
from pydantic.dataclasses import dataclass
from pydantic.fields import Field as PydanticField
from pydantic.fields import FieldInfo
from typing_extensions import dataclass_transform, runtime_checkable
@@ -36,6 +36,16 @@ ConfigType = type[DataclassInstance]
ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
# Overload for bare decorator use (`@config` applied directly to a class):
# the class is passed positionally as `cls` and the same class type is returned.
@overload
def config(cls: type[ConfigT]) -> type[ConfigT]: ...
# Overload for parameterized use (`@config(config=...)`): no class is passed,
# every argument is keyword-only, and the result is a decorator that takes the
# class and returns the same class type.
@overload
def config(
*, config: ConfigDict | None = None, **kwargs: Any
) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
@dataclass_transform(field_specifiers=(PydanticField,))
def config(
cls: type[ConfigT] | None = None,
@@ -59,7 +69,7 @@ def config(
merged_config.update(config)
def decorator(cls: type[ConfigT]) -> type[ConfigT]:
return pydantic_dataclass(cls, config=merged_config, **kwargs) # type: ignore[return-value]
return dataclass(cls, config=merged_config, **kwargs) # type: ignore[return-value]
# Called with arguments: @config(config=...)
if cls is None:
+9 -14
View File
@@ -246,15 +246,15 @@ OPTIMIZATION_LEVEL_TO_CONFIG = {
}
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
class VllmConfig: # type: ignore[misc]
@config(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
"""Dataclass which contains all vllm-related configuration. This
simplifies passing around the distinct configurations in the codebase.
"""
# TODO: use default_factory once default constructing ModelConfig doesn't
# try to download a model
model_config: ModelConfig = Field(default=None) # type: ignore[assignment]
model_config: ModelConfig = None # type: ignore[assignment]
"""Model configuration."""
cache_config: CacheConfig = Field(default_factory=CacheConfig)
"""Cache configuration."""
@@ -912,7 +912,8 @@ class VllmConfig: # type: ignore[misc]
tp_size = self.parallel_config.tensor_parallel_size
hidden_size = self.model_config.get_hidden_size()
element_size = self.model_config.dtype.itemsize # type: ignore[union-attr]
assert isinstance(self.model_config.dtype, torch.dtype)
element_size = self.model_config.dtype.itemsize
pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
hidden_size, tp_size, element_size
)
@@ -1246,14 +1247,6 @@ class VllmConfig: # type: ignore[misc]
)
self.compilation_config.debug_dump_path = env_path
def has_blocked_weights(): # type: ignore[no-redef]
if self.quant_config is not None:
if hasattr(self.quant_config, "weight_block_size"):
return self.quant_config.weight_block_size is not None
elif hasattr(self.quant_config, "has_blocked_weights"):
return self.quant_config.has_blocked_weights()
return False
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
# On H100 the CUDA kernel is faster than
# native implementation
@@ -1502,9 +1495,10 @@ class VllmConfig: # type: ignore[misc]
tp_size = self.parallel_config.tensor_parallel_size
max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
if max_size is not None:
assert isinstance(self.model_config.dtype, torch.dtype)
max_token_num = max_size // (
self.model_config.get_hidden_size()
* self.model_config.dtype.itemsize # type: ignore[union-attr]
* self.model_config.dtype.itemsize
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_endpoints.append(max_token_num)
@@ -1527,7 +1521,8 @@ class VllmConfig: # type: ignore[misc]
tp_size = self.parallel_config.tensor_parallel_size
hidden_size = self.model_config.get_hidden_size()
element_size = self.model_config.dtype.itemsize # type: ignore[union-attr]
assert isinstance(self.model_config.dtype, torch.dtype)
element_size = self.model_config.dtype.itemsize
pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
hidden_size, tp_size, element_size
)
+1 -1
View File
@@ -1935,7 +1935,7 @@ class EngineArgs:
)
offload_config = OffloadConfig(
offload_backend=self.offload_backend, # type: ignore[arg-type]
offload_backend=self.offload_backend,
uva=UVAOffloadConfig(
cpu_offload_gb=self.cpu_offload_gb,
cpu_offload_params=self.cpu_offload_params,
+5
View File
@@ -409,6 +409,11 @@ class LLM:
# Cache for __repr__ to avoid repeated collective_rpc calls
self._cached_repr: str | None = None
@classmethod
def from_engine_args(cls, engine_args: EngineArgs) -> "LLM":
    """Construct an `LLM` from an `EngineArgs` instance.

    The instance attributes of `engine_args` (as returned by `vars`) are
    forwarded unchanged to the constructor as keyword arguments.
    """
    init_kwargs = vars(engine_args)
    return cls(**init_kwargs)
def get_tokenizer(self) -> TokenizerLike:
return self.llm_engine.get_tokenizer()
@@ -28,7 +28,10 @@ class ModelArchConfigConvertorBase:
self.hf_text_config = hf_text_config
def get_architectures(self) -> list[str]:
    """Return the architecture names declared on `self.hf_config`.

    Returns:
        The `architectures` attribute of `self.hf_config`, or an empty list
        when that attribute is missing or falsy (including `None`).
    """
    # Sometimes we get here from `vllm_config.with_hf_config(text_config)` where
    # `text_config` is a sub-config from a multi-modal model. In that case the
    # sub-config's `architectures` is explicitly `None` rather than missing, so
    # the `getattr` default alone is not enough; `or []` normalizes it to a list.
    return getattr(self.hf_config, "architectures", None) or []
def get_num_hidden_layers(self) -> int:
return getattr(self.hf_text_config, "num_hidden_layers", 0)
@@ -128,7 +131,7 @@ class ModelArchConfigConvertorBase:
hf_config: PretrainedConfig,
model_id: str,
revision: str | None,
config_format: ConfigFormat,
config_format: str | ConfigFormat,
):
# NOTE: getattr(config, "dtype", torch.float32) is not correct
# because config.dtype can be None.