[Mypy] Better fixes for the mypy issues in vllm/config (#37902)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-06-06 00:16:14 +00:00 · 2026-03-25 13:14:43 +00:00
parent 34d317dcec
commit d215d1efca
35 changed files with 153 additions and 182 deletions
@@ -42,7 +42,6 @@ details.

 import random
 import time
-from dataclasses import fields

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -124,7 +123,7 @@ def main(args):

    # Create the LLM engine
    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

    print("------warm up------")
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from dataclasses import fields

 from transformers import PreTrainedTokenizerBase

@@ -197,7 +196,7 @@ def main(args):

    engine_args = EngineArgs.from_cli_args(args)

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

    sampling_params = SamplingParams(
        temperature=0,
@@ -6,7 +6,6 @@ import argparse
 import json
 import random
 import time
-from dataclasses import fields

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -79,7 +78,7 @@ def run_vllm(
 ) -> float:
    from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

    assert all(
        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
@@ -9,7 +9,6 @@ on HuggingFace model repository.
 """

 import os
-from dataclasses import asdict
 from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
@@ -633,7 +632,7 @@ def main(args):
        req_data.engine_args.limit_mm_per_prompt or {}
    )

-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    engine_args = vars(req_data.engine_args) | {"seed": args.seed}
    if args.tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
    llm = LLM(**engine_args)
@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
 import os
 import time
 from collections.abc import Sequence
-from dataclasses import asdict
 from typing import NamedTuple

 from vllm import LLM, EngineArgs, PromptType, SamplingParams
@@ -91,13 +90,12 @@ def main(args):
    req_data = model_example_map[model]()

    # Disable other modalities to save memory
+    engine_args = req_data.engine_args
    default_limits = {"image": 0, "video": 0, "audio": 0}
-    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {}
-    )
-
-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
-    llm = LLM(**engine_args)
+    limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
+    engine_args.limit_mm_per_prompt = limit_mm_per_prompt
+    engine_args.seed = args.seed
+    llm = LLM.from_engine_args(engine_args)

    prompts = req_data.prompts

@@ -20,8 +20,6 @@ python load_sharded_state.py \
    --max-tokens 50
 """

-import dataclasses
-
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.utils.argparse_utils import FlexibleArgumentParser

@@ -64,7 +62,7 @@ def main():
    print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")

    # Load the model using engine args
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)

    # Prepare sampling parameters
    sampling_params = SamplingParams(
@@ -21,7 +21,6 @@ llm = LLM(
 )
 """

-import dataclasses
 import os
 import shutil
 from pathlib import Path
@@ -60,7 +59,7 @@ def main(args):
    if not Path(model_path).is_dir():
        raise ValueError("model path must be a local directory")
    # Create LLM instance from arguments
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)
    # Prepare output directory
    Path(args.output).mkdir(exist_ok=True)
    # Dump worker states to output directory
@@ -11,7 +11,6 @@ on HuggingFace model repository.
 import os
 import random
 from contextlib import contextmanager
-from dataclasses import asdict
 from typing import NamedTuple

 from huggingface_hub import snapshot_download
@@ -2434,13 +2433,13 @@ def main(args):
        req_data.engine_args.limit_mm_per_prompt or {}
    )

-    engine_args = asdict(req_data.engine_args) | {
-        "seed": args.seed,
-        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
-    }
+    engine_args = req_data.engine_args
+    engine_args.seed = args.seed
+    mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
+    engine_args.mm_processor_cache_gb = mm_processor_cache_gb
    if args.tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = args.tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = (
@@ -8,7 +8,6 @@ using the chat template defined by the model.

 import os
 from argparse import Namespace
-from dataclasses import asdict
 from typing import NamedTuple

 from huggingface_hub import snapshot_download
@@ -1481,10 +1480,11 @@ def run_generate(
 ):
    req_data = model_example_map[model](question, image_urls)

-    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    engine_args = req_data.engine_args
+    engine_args.seed = seed
    if tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
@@ -1521,10 +1521,11 @@ def run_chat(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

-    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    engine_args = req_data.engine_args
+    engine_args.seed = seed
    if tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

    sampling_params = (
        SamplingParams(
@@ -10,12 +10,11 @@ on HuggingFace model repository.
 """

 import argparse
-from dataclasses import asdict
 from pathlib import Path

 from PIL.Image import Image

-from vllm import LLM, EngineArgs
+from vllm import LLM
 from vllm.multimodal.utils import fetch_image
 from vllm.utils.print_utils import print_embeddings

@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}


 def run_clip(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
        model="openai/clip-vit-base-patch32",
        runner="pooling",
        limit_mm_per_prompt={"image": 1},
+        seed=seed,
    )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
    print("Text embedding output:")
    outputs = llm.embed(text, use_tqdm=False)
    print_embeddings(outputs[0].outputs.embedding)
@@ -53,15 +51,14 @@ def run_clip(seed: int):


 def run_e5_v(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
        model="royokong/e5-v",
        runner="pooling",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
+        seed=seed,
    )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
    llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501

    print("Text embedding output:")
@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):

        multi_modal_data["image"] = post_process_image(multi_modal_data["image"])

-    engine_args = EngineArgs(
-        model="Qwen/Qwen3-VL-Embedding-2B",
-        runner="pooling",
-        max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
-        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
-    )
    default_instruction = "Represent the user's input."
    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
    prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
    prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
    prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
+    llm = LLM(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
+        seed=seed,
+    )

    print("Text embedding output:")
    outputs = llm.embed(prompt_text, use_tqdm=False)
@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):


 def run_siglip(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
        model="google/siglip-base-patch16-224",
        runner="pooling",
        limit_mm_per_prompt={"image": 1},
+        seed=seed,
    )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
    print("Text embedding output:")
    outputs = llm.embed(text, use_tqdm=False)
    print_embeddings(outputs[0].outputs.embedding)
@@ -174,16 +170,15 @@ def run_siglip(seed: int):


 def run_vlm2vec_phi3v(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
        model="TIGER-Lab/VLM2Vec-Full",
        runner="pooling",
        max_model_len=4096,
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
        limit_mm_per_prompt={"image": 1},
+        seed=seed,
    )
-
-    llm = LLM(**asdict(engine_args) | {"seed": seed})
    image_token = "<|image_1|>"

    print("Text embedding output:")
@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
    processor.save_pretrained(merged_path)
    print("Done!")

-    engine_args = EngineArgs(
+    llm = LLM(
        model=merged_path,
        runner="pooling",
        max_model_len=4096,
@@ -268,9 +263,8 @@ def run_vlm2vec_qwen2vl(seed: int):
            "max_pixels": 12845056,
        },
        limit_mm_per_prompt={"image": 1},
+        seed=seed,
    )
-
-    llm = LLM(**asdict(engine_args) | {"seed": seed})
    image_token = "<|image_pad|>"

    print("Text embedding output:")
@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).

 from argparse import Namespace
 from collections.abc import Callable
-from dataclasses import asdict
 from pathlib import Path
 from typing import NamedTuple

@@ -125,7 +124,7 @@ def main(args: Namespace):
    model_request = model_example_map[args.model_name]()
    engine_args = model_request.engine_args

-    llm = LLM(**asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)

    print("Query: string & Document: string")
    outputs = llm.score(query, document)
@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
        ctx,
        patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
    ):
+        kwargs = {}
+        if cudagraph_capture_sizes is not None:
+            kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
+        if max_cudagraph_capture_size is not None:
+            kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
        compilation_config = CompilationConfig(
-            cudagraph_capture_sizes=cudagraph_capture_sizes,
-            max_cudagraph_capture_size=max_cudagraph_capture_size,
            pass_config=PassConfig(
                enable_sp=enable_sp,
                fuse_norm_quant=True,
@@ -425,6 +428,7 @@ def test_cudagraph_sizes_post_init(
                sp_min_token_num=512 if enable_sp else None,
            ),
            cudagraph_mode=cudagraph_mode,
+            **kwargs,
        )
        engine_args = EngineArgs(
            model="facebook/opt-125m",
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for HF_HUB_OFFLINE mode"""

-import dataclasses
 import importlib
 import sys

@@ -12,7 +11,6 @@ import urllib3

 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.engine.arg_utils import EngineArgs

 MODEL_CONFIGS = [
    {
@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
-            engine_args = EngineArgs(model="facebook/opt-125m")
-            LLM(**dataclasses.asdict(engine_args))
+            LLM(model="facebook/opt-125m")
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import asdict
 from typing import NamedTuple

 import pytest
@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
    images = [asset.pil_image for asset in image_assets]
    image_urls = [encode_image_url(image) for image in images]

-    engine_args = EngineArgs(
-        model=MODEL_NAME,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
@@ -54,8 +45,14 @@ def test_keye_vl(image_assets, question: str):
        messages, tokenize=False, add_generation_prompt=True
    )

-    engine_args = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_args)
+    llm = LLM(
+        model=MODEL_NAME,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        seed=42,
+    )

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=None
@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate outputs
 using different ViT attention backends. Tests are parametrized by model and backend.
 """

-from dataclasses import asdict
 from typing import Any

 import pytest
 from transformers import AutoProcessor

-from vllm import LLM, EngineArgs, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})

    # Create engine
-    engine_args = EngineArgs(
+    llm = LLM(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
@@ -283,11 +282,9 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
+        seed=42,
    )

-    engine_dict = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_dict)
-
    # Generate
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.generate(
@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    messages = build_dots_ocr_prompt([stop_sign_image], config)

    # Create engine
-    engine_args = EngineArgs(
+    llm = LLM(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
@@ -327,11 +324,9 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
+        seed=42,
    )

-    engine_dict = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_dict)
-
    # Generate using chat
    sampling_params = SamplingParams(**config["sampling_params"])
    outputs = llm.chat(messages=messages, sampling_params=sampling_params)
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
-from dataclasses import asdict

 import pytest
 import pytest_asyncio
@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
@pytest.fixture
 def engine():
    engine_args = EngineArgs(**ENGINE_CONFIG)
-    llm = LLM(**asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)
    try:
        yield llm
    finally:
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import asdict
 from typing import NamedTuple

 import pytest
 from PIL import Image

-from vllm import LLM, EngineArgs, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import AttentionConfig, KVTransferConfig
 from vllm.multimodal.utils import encode_image_url
@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
    # Using tmp_path as the storage path to store KV
    print(f"KV storage path at: {str(tmp_path)}")

-    # Configure the ExampleConnector
-    kv_transfer_config = KVTransferConfig(
-        kv_connector="ExampleConnector",
-        kv_role="kv_both",
-        kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
-    )
-
-    engine_args = EngineArgs(
-        model=MODEL_NAME,
-        max_model_len=8192,
-        max_num_seqs=1,
-        gpu_memory_utilization=0.4,
-        attention_config=AttentionConfig(backend=attn_backend),
-        enforce_eager=True,
-        kv_transfer_config=kv_transfer_config,
-        limit_mm_per_prompt={"image": 2},
-    )
-
    # don't put this import at the top level
    # it will call torch.accelerator.device_count()
    from transformers import AutoProcessor
@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
    assert image_1 != image_2, "The images should not be identical"

    # Create the LLM instance
-    engine_args = asdict(engine_args)
-    llm = LLM(**engine_args)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=8192,
+        max_num_seqs=1,
+        gpu_memory_utilization=0.4,
+        attention_config=AttentionConfig(backend=attn_backend),
+        enforce_eager=True,
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="ExampleConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
+        ),
+        limit_mm_per_prompt={"image": 2},
+    )

    # Prepare the input cases
    input_cases = [
@@ -6,7 +6,6 @@ import argparse
 import json
 import os
 import time
-from dataclasses import fields
 from typing import Any

 import numpy as np
@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
    assert llm.llm_engine.model_config.max_model_len >= (
        args.input_len + args.output_len
    ), (
@@ -17,7 +17,6 @@ import argparse
 import json
 import time
 from collections import defaultdict
-from dataclasses import fields
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Literal

@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
        args.seed = 0

    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

    tokenizer = llm.get_tokenizer()
    requests = get_requests(args, tokenizer)
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import time
 from contextlib import contextmanager
-from dataclasses import fields
 from typing import Any

 import numpy as np
@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
        # Measure total startup time
        start_time = time.perf_counter()

-        llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+        llm = LLM.from_engine_args(engine_args)

        total_startup_time = time.perf_counter() - start_time

@@ -8,7 +8,6 @@ import os
 import random
 import time
 import warnings
-from dataclasses import fields
 from typing import Any

 import torch
@@ -53,7 +52,7 @@ def run_vllm(
 ) -> tuple[float, list[RequestOutput] | None]:
    from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
@@ -141,7 +140,7 @@ def run_vllm_chat(
    """
    from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

    assert all(
        llm.llm_engine.model_config.max_model_len
@@ -116,29 +116,29 @@ class PassConfig:
    """

    # New flags
-    fuse_norm_quant: bool | None = Field(default=None)
+    fuse_norm_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom RMSNorm + quant ops."""
-    fuse_act_quant: bool | None = Field(default=None)
+    fuse_act_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom SiluMul + quant ops."""
-    fuse_attn_quant: bool | None = Field(default=None)
+    fuse_attn_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom attention + quant ops."""
    eliminate_noops: bool = Field(default=True)
    """Eliminate no-op ops."""
-    enable_sp: bool | None = Field(default=None)
+    enable_sp: bool = None  # type: ignore[assignment]
    """Enable sequence parallelism. Requires TP>1. Automatically disabled
    if the model's hidden_size is too small for SP to be beneficial
    (threshold is device-capability dependent)."""
-    fuse_gemm_comms: bool | None = Field(default=None)
+    fuse_gemm_comms: bool = None  # type: ignore[assignment]
    """Enable async TP."""
-    fuse_allreduce_rms: bool | None = Field(default=None)
+    fuse_allreduce_rms: bool = None  # type: ignore[assignment]
    """Enable flashinfer allreduce fusion."""
    enable_qk_norm_rope_fusion: bool = False
    """Enable fused Q/K RMSNorm + RoPE pass."""

    # ROCm/AITER specific fusions
-    fuse_act_padding: bool | None = Field(default=None)
+    fuse_act_padding: bool = None  # type: ignore[assignment]
    """Fuse the custom RMSNorm + padding ops."""
-    fuse_rope_kvcache: bool | None = Field(default=None)
+    fuse_rope_kvcache: bool = None  # type: ignore[assignment]
    """Fuse the QK rope + KV cache ops."""

    rope_kvcache_fusion_max_token_num: int = 256
@@ -405,7 +405,7 @@ class CompilationConfig:
    """

    # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)  # type: ignore[assignment]
+    mode: CompilationMode = None  # type: ignore[assignment]
    """The compilation approach used for torch.compile-based compilation of the
    model.

@@ -545,7 +545,7 @@ class CompilationConfig:
    constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

    # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)  # type: ignore[assignment]
+    cudagraph_mode: CUDAGraphMode = None  # type: ignore[assignment]
    """
    The mode of the cudagraph:

@@ -586,7 +586,7 @@ class CompilationConfig:
    It means the first several runs will be treated as warmup runs.
    Only after that, the execution will be recorded, and the recorded
    cudagraph will be used for subsequent runs."""
-    cudagraph_capture_sizes: list[int] | None = None
+    cudagraph_capture_sizes: list[int] = None  # type: ignore[assignment]
    """Sizes to capture cudagraph.
    - None (default): capture sizes are inferred from vllm config.
    - list[int]: capture sizes are specified as given."""
@@ -607,7 +607,7 @@ class CompilationConfig:
    When `enable_lora` is False, this option has no effect.
    """

-    use_inductor_graph_partition: bool = Field(default=None)  # type: ignore[assignment]
+    use_inductor_graph_partition: bool = None  # type: ignore[assignment]
    """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
    This partition happens at inductor codegen time after all passes and fusions
    are finished. It generates a single `call` function which wraps
@@ -630,7 +630,7 @@ class CompilationConfig:
    pass_config: PassConfig = field(default_factory=PassConfig)
    """Custom inductor passes, see PassConfig for more details"""

-    max_cudagraph_capture_size: int | None = field(default=None)
+    max_cudagraph_capture_size: int = None  # type: ignore[assignment]
    """The maximum cudagraph capture size.

    If cudagraph_capture_sizes is specified, this will be set to the largest
@@ -750,7 +750,7 @@ class CompilationConfig:
        return hash_factors(factors)

    def __repr__(self) -> str:
-        exclude = {
+        exclude: dict[str, bool | dict[str, bool]] = {
            "static_forward_context": True,
            "enabled_custom_ops": True,
            "disabled_custom_ops": True,
@@ -770,9 +770,7 @@ class CompilationConfig:
            exclude["pass_config"] = pass_config_exclude

        config = TypeAdapter(CompilationConfig).dump_python(
-            self,
-            exclude=exclude,  # type: ignore[arg-type]
-            exclude_unset=True,
+            self, exclude=exclude, exclude_unset=True
        )

        return str(config)
@@ -1023,7 +1021,6 @@ class CompilationConfig:
                        "Unrecognized size type in compile_sizes, "
                        f"expect 'cudagraph_capture_sizes', got {x}"
                    )
-                    assert self.cudagraph_capture_sizes is not None
                    computed_compile_sizes.extend(self.cudagraph_capture_sizes)
                else:
                    assert isinstance(x, int)
@@ -1031,7 +1028,6 @@ class CompilationConfig:
        self.compile_sizes = computed_compile_sizes  # type: ignore

        # make sure the sizes are in ascending order
-        assert self.cudagraph_capture_sizes is not None
        self.cudagraph_capture_sizes.sort()
        if self.cudagraph_capture_sizes:
            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
@@ -1123,7 +1119,6 @@ class CompilationConfig:

    def set_splitting_ops_for_attn_fusion(self):
        assert self.pass_config.fuse_attn_quant
-        assert self.cudagraph_mode is not None
        if self.splitting_ops is None:
            self.splitting_ops = []
            if self.cudagraph_mode.has_piecewise_cudagraphs():
@@ -13,8 +13,8 @@ from vllm.utils.hashing import safe_hash
 Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class DeviceConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

    device: SkipValidation[Device | torch.device | None] = "auto"
@@ -4,7 +4,7 @@
 from collections.abc import Callable
 from typing import Any, Literal

-from pydantic import Field, field_validator
+from pydantic import field_validator

 from vllm.config.utils import config
 from vllm.utils.hashing import safe_hash
@@ -26,7 +26,7 @@ MoEBackend = Literal[
 class KernelConfig:
    """Configuration for kernel selection and warmup behavior."""

-    enable_flashinfer_autotune: bool | None = Field(default=None)
+    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
    """If True, run FlashInfer autotuning during kernel warmup."""

    moe_backend: MoEBackend = "auto"
@@ -4,8 +4,6 @@

 from typing import Literal

-from pydantic import Field
-
 from vllm.config.utils import config


@@ -18,7 +16,7 @@ class KVEventsConfig:
    Events can be published externally by zmq using the event publisher config.
    """

-    publisher: Literal["null", "zmq"] | None = Field(default=None)
+    publisher: Literal["null", "zmq"] = None  # type: ignore[assignment]
    """The publisher to use for publishing kv events. Can be "null", "zmq".
    """

@@ -25,8 +25,8 @@ MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
 LoRAExtraVocabSize = Literal[256, 512]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class LoRAConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
    """Configuration for LoRA."""

    max_lora_rank: MaxLoRARanks = 16
@@ -102,8 +102,8 @@ AttnTypeStr = Literal[
 ]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class ModelConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class ModelConfig:
    """Configuration for the model."""

    model: str = "Qwen/Qwen3-0.6B"
@@ -121,7 +121,7 @@ class ModelConfig:  # type: ignore[misc]
    """Convert the model using adapters defined in
    [vllm.model_executor.models.adapters][]. The most common use case is to
    adapt a text generation model to be used for pooling tasks."""
-    tokenizer: str = Field(default=None)  # type: ignore[assignment]
+    tokenizer: str = None  # type: ignore[assignment]
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
    tokenizer_mode: TokenizerMode | str = "auto"
@@ -583,7 +583,7 @@ class ModelConfig:  # type: ignore[misc]
            self.dtype,
            is_pooling_model=self.runner_type == "pooling",
            revision=self.revision,
-            config_format=self.config_format,  # type: ignore[arg-type]
+            config_format=self.config_format,
        )

        self.original_max_model_len = self.max_model_len
@@ -733,7 +733,7 @@ class ModelConfig:  # type: ignore[misc]

    @property
    def architectures(self) -> list[str]:
-        return self.model_arch_config.architectures  # type: ignore[return-value]
+        return self.model_arch_config.architectures

    @property
    def architecture(self) -> str:
@@ -1944,7 +1944,7 @@ def _get_and_verify_dtype(
    *,
    is_pooling_model: bool,
    revision: str | None = None,
-    config_format: ConfigFormat = "hf",
+    config_format: str | ConfigFormat = "hf",
 ) -> torch.dtype:
    config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
        config, model_id, revision=revision, config_format=config_format
@@ -16,7 +16,7 @@ class ModelArchitectureConfig:
    Configuration for model architecture that required by vLLM runtime
    """

-    architectures: list[str] | None
+    architectures: list[str]
    """List of model architecture class names (e.g., ['LlamaForCausalLM']).
       It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""

@@ -194,7 +194,7 @@ class ParallelConfig:
    threshold, microbatching will be used. Otherwise, the request will be
    processed in a single batch."""

-    disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
+    disable_nccl_for_dp_synchronization: bool | None = None
    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py 
    to use Gloo instead of NCCL for its all reduce.

@@ -52,7 +52,7 @@ class SchedulerConfig:
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

-    max_num_scheduled_tokens: int | None = Field(default=None)
+    max_num_scheduled_tokens: int | None = None
    """Maximum number of tokens that the scheduler may issue in a single iteration.
    
    This is usually equal to max_num_batched_tokens, but can be smaller in cases
@@ -122,7 +122,7 @@ class SchedulerConfig:

    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
    # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] | None = Field(default=None)
+    scheduler_cls: str | type[object] | None = None
    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
    the default scheduler. Can be a class directly or the path to a class of
    form "mod.custom_class"."""
@@ -141,7 +141,7 @@ class SchedulerConfig:
    checking the first chunk. Prevents over-admission and KV cache thrashing
    with chunked prefill."""

-    async_scheduling: bool | None = Field(default=None)
+    async_scheduling: bool | None = None
    """If set to False, disable async scheduling. Async scheduling helps to
    avoid gaps in GPU utilization, leading to better latency and throughput.
    """
@@ -11,13 +11,13 @@ import os
 import pathlib
 import textwrap
 from collections.abc import Callable, Mapping, Sequence, Set
-from dataclasses import MISSING, dataclass, field, fields, is_dataclass
+from dataclasses import MISSING, field, fields, is_dataclass
 from itertools import pairwise
-from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast, overload

 import torch
 from pydantic import ConfigDict
-from pydantic.dataclasses import dataclass as pydantic_dataclass
+from pydantic.dataclasses import dataclass
 from pydantic.fields import Field as PydanticField
 from pydantic.fields import FieldInfo
 from typing_extensions import dataclass_transform, runtime_checkable
@@ -36,6 +36,16 @@ ConfigType = type[DataclassInstance]
 ConfigT = TypeVar("ConfigT", bound=DataclassInstance)


+@overload
+def config(cls: type[ConfigT]) -> type[ConfigT]: ...
+
+
+@overload
+def config(
+    *, config: ConfigDict | None = None, **kwargs: Any
+) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
+
+
@dataclass_transform(field_specifiers=(PydanticField,))
 def config(
    cls: type[ConfigT] | None = None,
@@ -59,7 +69,7 @@ def config(
        merged_config.update(config)

    def decorator(cls: type[ConfigT]) -> type[ConfigT]:
-        return pydantic_dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]
+        return dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]

    # Called with arguments: @config(config=...)
    if cls is None:
@@ -246,15 +246,15 @@ OPTIMIZATION_LEVEL_TO_CONFIG = {
 }


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class VllmConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    """

    # TODO: use default_factory once default constructing ModelConfig doesn't
    # try to download a model
-    model_config: ModelConfig = Field(default=None)  # type: ignore[assignment]
+    model_config: ModelConfig = None  # type: ignore[assignment]
    """Model configuration."""
    cache_config: CacheConfig = Field(default_factory=CacheConfig)
    """Cache configuration."""
@@ -912,7 +912,8 @@ class VllmConfig:  # type: ignore[misc]

                    tp_size = self.parallel_config.tensor_parallel_size
                    hidden_size = self.model_config.get_hidden_size()
-                    element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                    assert isinstance(self.model_config.dtype, torch.dtype)
+                    element_size = self.model_config.dtype.itemsize
                    pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                        hidden_size, tp_size, element_size
                    )
@@ -1246,14 +1247,6 @@ class VllmConfig:  # type: ignore[misc]
                )
            self.compilation_config.debug_dump_path = env_path

-        def has_blocked_weights():  # type: ignore[no-redef]
-            if self.quant_config is not None:
-                if hasattr(self.quant_config, "weight_block_size"):
-                    return self.quant_config.weight_block_size is not None
-                elif hasattr(self.quant_config, "has_blocked_weights"):
-                    return self.quant_config.has_blocked_weights()
-            return False
-
        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than
        # native implementation
@@ -1502,9 +1495,10 @@ class VllmConfig:  # type: ignore[misc]
            tp_size = self.parallel_config.tensor_parallel_size
            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
            if max_size is not None:
+                assert isinstance(self.model_config.dtype, torch.dtype)
                max_token_num = max_size // (
                    self.model_config.get_hidden_size()
-                    * self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                    * self.model_config.dtype.itemsize
                )
                if compile_range_end is not None and max_token_num < compile_range_end:
                    computed_compile_ranges_endpoints.append(max_token_num)
@@ -1527,7 +1521,8 @@ class VllmConfig:  # type: ignore[misc]

                tp_size = self.parallel_config.tensor_parallel_size
                hidden_size = self.model_config.get_hidden_size()
-                element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                assert isinstance(self.model_config.dtype, torch.dtype)
+                element_size = self.model_config.dtype.itemsize
                pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                    hidden_size, tp_size, element_size
                )
@@ -1935,7 +1935,7 @@ class EngineArgs:
            )

        offload_config = OffloadConfig(
-            offload_backend=self.offload_backend,  # type: ignore[arg-type]
+            offload_backend=self.offload_backend,
            uva=UVAOffloadConfig(
                cpu_offload_gb=self.cpu_offload_gb,
                cpu_offload_params=self.cpu_offload_params,
@@ -409,6 +409,11 @@ class LLM:
        # Cache for __repr__ to avoid repeated collective_rpc calls
        self._cached_repr: str | None = None

+    @classmethod
+    def from_engine_args(cls, engine_args: EngineArgs) -> "LLM":
+        """Create an LLM, passing each attribute of `engine_args` as a keyword."""
+        return cls(**vars(engine_args))
+
    def get_tokenizer(self) -> TokenizerLike:
        return self.llm_engine.get_tokenizer()

@@ -28,7 +28,10 @@ class ModelArchConfigConvertorBase:
        self.hf_text_config = hf_text_config

    def get_architectures(self) -> list[str]:
-        return getattr(self.hf_config, "architectures", [])
+        # We can get here from `vllm_config.with_hf_config(text_config)`, where
+        # `text_config` is a sub-config of a multi-modal model. Such a sub-config has
+        # `architectures` explicitly set to `None`, so fall back to an empty list.
+        return getattr(self.hf_config, "architectures", None) or []

    def get_num_hidden_layers(self) -> int:
        return getattr(self.hf_text_config, "num_hidden_layers", 0)
@@ -128,7 +131,7 @@ class ModelArchConfigConvertorBase:
        hf_config: PretrainedConfig,
        model_id: str,
        revision: str | None,
-        config_format: ConfigFormat,
+        config_format: str | ConfigFormat,
    ):
        # NOTE: getattr(config, "dtype", torch.float32) is not correct
        # because config.dtype can be None.