mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Mypy] Better fixes for the mypy issues in vllm/config (#37902)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -42,7 +42,6 @@ details.
|
||||
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
@@ -124,7 +123,7 @@ def main(args):
|
||||
|
||||
# Create the LLM engine
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
||||
|
||||
print("------warm up------")
|
||||
|
||||
@@ -32,7 +32,6 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@@ -197,7 +196,7 @@ def main(args):
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
|
||||
@@ -6,7 +6,6 @@ import argparse
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
@@ -79,7 +78,7 @@ def run_vllm(
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
|
||||
|
||||
@@ -9,7 +9,6 @@ on HuggingFace model repository.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
@@ -633,7 +632,7 @@ def main(args):
|
||||
req_data.engine_args.limit_mm_per_prompt or {}
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
engine_args = vars(req_data.engine_args) | {"seed": args.seed}
|
||||
if args.tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, EngineArgs, PromptType, SamplingParams
|
||||
@@ -91,13 +90,12 @@ def main(args):
|
||||
req_data = model_example_map[model]()
|
||||
|
||||
# Disable other modalities to save memory
|
||||
engine_args = req_data.engine_args
|
||||
default_limits = {"image": 0, "video": 0, "audio": 0}
|
||||
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
|
||||
req_data.engine_args.limit_mm_per_prompt or {}
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
|
||||
engine_args.limit_mm_per_prompt = limit_mm_per_prompt
|
||||
engine_args.seed = args.seed
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
prompts = req_data.prompts
|
||||
|
||||
|
||||
@@ -20,8 +20,6 @@ python load_sharded_state.py \
|
||||
--max-tokens 50
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
@@ -64,7 +62,7 @@ def main():
|
||||
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
|
||||
|
||||
# Load the model using engine args
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
# Prepare sampling parameters
|
||||
sampling_params = SamplingParams(
|
||||
|
||||
@@ -21,7 +21,6 @@ llm = LLM(
|
||||
)
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
@@ -60,7 +59,7 @@ def main(args):
|
||||
if not Path(model_path).is_dir():
|
||||
raise ValueError("model path must be a local directory")
|
||||
# Create LLM instance from arguments
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
# Prepare output directory
|
||||
Path(args.output).mkdir(exist_ok=True)
|
||||
# Dump worker states to output directory
|
||||
|
||||
@@ -11,7 +11,6 @@ on HuggingFace model repository.
|
||||
import os
|
||||
import random
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
@@ -2434,13 +2433,13 @@ def main(args):
|
||||
req_data.engine_args.limit_mm_per_prompt or {}
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {
|
||||
"seed": args.seed,
|
||||
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
|
||||
}
|
||||
engine_args = req_data.engine_args
|
||||
engine_args.seed = args.seed
|
||||
mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
|
||||
engine_args.mm_processor_cache_gb = mm_processor_cache_gb
|
||||
if args.tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
engine_args.tensor_parallel_size = args.tensor_parallel_size
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
||||
prompts = (
|
||||
|
||||
@@ -8,7 +8,6 @@ using the chat template defined by the model.
|
||||
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
@@ -1481,10 +1480,11 @@ def run_generate(
|
||||
):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
engine_args = req_data.engine_args
|
||||
engine_args.seed = seed
|
||||
if tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
engine_args.tensor_parallel_size = tensor_parallel_size
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
|
||||
@@ -1521,10 +1521,11 @@ def run_chat(
|
||||
req_data.engine_args.limit_mm_per_prompt or {}
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
engine_args = req_data.engine_args
|
||||
engine_args.seed = seed
|
||||
if tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
engine_args.tensor_parallel_size = tensor_parallel_size
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
sampling_params = (
|
||||
SamplingParams(
|
||||
|
||||
@@ -10,12 +10,11 @@ on HuggingFace model repository.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm import LLM
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils.print_utils import print_embeddings
|
||||
|
||||
@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}
|
||||
|
||||
|
||||
def run_clip(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model="openai/clip-vit-base-patch32",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
print("Text embedding output:")
|
||||
outputs = llm.embed(text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
@@ -53,15 +51,14 @@ def run_clip(seed: int):
|
||||
|
||||
|
||||
def run_e5_v(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model="royokong/e5-v",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
|
||||
|
||||
print("Text embedding output:")
|
||||
@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):
|
||||
|
||||
multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen3-VL-Embedding-2B",
|
||||
runner="pooling",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
|
||||
)
|
||||
default_instruction = "Represent the user's input."
|
||||
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
|
||||
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
|
||||
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen3-VL-Embedding-2B",
|
||||
runner="pooling",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
print("Text embedding output:")
|
||||
outputs = llm.embed(prompt_text, use_tqdm=False)
|
||||
@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):
|
||||
|
||||
|
||||
def run_siglip(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model="google/siglip-base-patch16-224",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
print("Text embedding output:")
|
||||
outputs = llm.embed(text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
@@ -174,16 +170,15 @@ def run_siglip(seed: int):
|
||||
|
||||
|
||||
def run_vlm2vec_phi3v(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
image_token = "<|image_1|>"
|
||||
|
||||
print("Text embedding output:")
|
||||
@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
|
||||
processor.save_pretrained(merged_path)
|
||||
print("Done!")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model=merged_path,
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
@@ -268,9 +263,8 @@ def run_vlm2vec_qwen2vl(seed: int):
|
||||
"max_pixels": 12845056,
|
||||
},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
image_token = "<|image_pad|>"
|
||||
|
||||
print("Text embedding output:")
|
||||
|
||||
@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).
|
||||
|
||||
from argparse import Namespace
|
||||
from collections.abc import Callable
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
@@ -125,7 +124,7 @@ def main(args: Namespace):
|
||||
model_request = model_example_map[args.model_name]()
|
||||
engine_args = model_request.engine_args
|
||||
|
||||
llm = LLM(**asdict(engine_args))
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
print("Query: string & Document: string")
|
||||
outputs = llm.score(query, document)
|
||||
|
||||
@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
|
||||
ctx,
|
||||
patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
|
||||
):
|
||||
kwargs = {}
|
||||
if cudagraph_capture_sizes is not None:
|
||||
kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
|
||||
if max_cudagraph_capture_size is not None:
|
||||
kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
|
||||
compilation_config = CompilationConfig(
|
||||
cudagraph_capture_sizes=cudagraph_capture_sizes,
|
||||
max_cudagraph_capture_size=max_cudagraph_capture_size,
|
||||
pass_config=PassConfig(
|
||||
enable_sp=enable_sp,
|
||||
fuse_norm_quant=True,
|
||||
@@ -425,6 +428,7 @@ def test_cudagraph_sizes_post_init(
|
||||
sp_min_token_num=512 if enable_sp else None,
|
||||
),
|
||||
cudagraph_mode=cudagraph_mode,
|
||||
**kwargs,
|
||||
)
|
||||
engine_args = EngineArgs(
|
||||
model="facebook/opt-125m",
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for HF_HUB_OFFLINE mode"""
|
||||
|
||||
import dataclasses
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
@@ -12,7 +11,6 @@ import urllib3
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
|
||||
MODEL_CONFIGS = [
|
||||
{
|
||||
@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
|
||||
# Need to re-import huggingface_hub
|
||||
# and friends to set up offline mode
|
||||
_re_import_modules()
|
||||
engine_args = EngineArgs(model="facebook/opt-125m")
|
||||
LLM(**dataclasses.asdict(engine_args))
|
||||
LLM(model="facebook/opt-125m")
|
||||
finally:
|
||||
# Reset the environment after the test
|
||||
# NB: Assuming tests are run in online mode
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
image_urls = [encode_image_url(image) for image in images]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
@@ -54,8 +45,14 @@ def test_keye_vl(image_assets, question: str):
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
engine_args = asdict(engine_args) | {"seed": 42}
|
||||
llm = LLM(**engine_args)
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
seed=42,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0, max_tokens=256, stop_token_ids=None
|
||||
|
||||
@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate outputs
|
||||
using different ViT attention backends. Tests are parametrized by model and backend.
|
||||
"""
|
||||
|
||||
from dataclasses import asdict
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
from vllm.platforms import current_platform
|
||||
@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
|
||||
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
|
||||
|
||||
# Create engine
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model=config["model_name"],
|
||||
trust_remote_code=True,
|
||||
max_model_len=config["max_model_len"],
|
||||
@@ -283,11 +282,9 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
|
||||
mm_encoder_attn_backend=mm_encoder_attn_backend,
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
load_format="dummy",
|
||||
seed=42,
|
||||
)
|
||||
|
||||
engine_dict = asdict(engine_args) | {"seed": 42}
|
||||
llm = LLM(**engine_dict)
|
||||
|
||||
# Generate
|
||||
sampling_params = SamplingParams(**config["sampling_params"])
|
||||
outputs = llm.generate(
|
||||
@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
|
||||
messages = build_dots_ocr_prompt([stop_sign_image], config)
|
||||
|
||||
# Create engine
|
||||
engine_args = EngineArgs(
|
||||
llm = LLM(
|
||||
model=config["model_name"],
|
||||
trust_remote_code=True,
|
||||
max_model_len=config["max_model_len"],
|
||||
@@ -327,11 +324,9 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
|
||||
mm_encoder_attn_backend=mm_encoder_attn_backend,
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
load_format="dummy",
|
||||
seed=42,
|
||||
)
|
||||
|
||||
engine_dict = asdict(engine_args) | {"seed": 42}
|
||||
llm = LLM(**engine_dict)
|
||||
|
||||
# Generate using chat
|
||||
sampling_params = SamplingParams(**config["sampling_params"])
|
||||
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
from dataclasses import asdict
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
|
||||
@pytest.fixture
|
||||
def engine():
|
||||
engine_args = EngineArgs(**ENGINE_CONFIG)
|
||||
llm = LLM(**asdict(engine_args))
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
try:
|
||||
yield llm
|
||||
finally:
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import AttentionConfig, KVTransferConfig
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
|
||||
# Using tmp_path as the storage path to store KV
|
||||
print(f"KV storage path at: {str(tmp_path)}")
|
||||
|
||||
# Configure the ExampleConnector
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="ExampleConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
|
||||
)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=1,
|
||||
gpu_memory_utilization=0.4,
|
||||
attention_config=AttentionConfig(backend=attn_backend),
|
||||
enforce_eager=True,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
)
|
||||
|
||||
# don't put this import at the top level
|
||||
# it will call torch.accelerator.device_count()
|
||||
from transformers import AutoProcessor
|
||||
@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
|
||||
assert image_1 != image_2, "The images should not be identical"
|
||||
|
||||
# Create the LLM instance
|
||||
engine_args = asdict(engine_args)
|
||||
llm = LLM(**engine_args)
|
||||
llm = LLM(
|
||||
model=MODEL_NAME,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=1,
|
||||
gpu_memory_utilization=0.4,
|
||||
attention_config=AttentionConfig(backend=attn_backend),
|
||||
enforce_eager=True,
|
||||
kv_transfer_config=KVTransferConfig(
|
||||
kv_connector="ExampleConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
|
||||
),
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
)
|
||||
|
||||
# Prepare the input cases
|
||||
input_cases = [
|
||||
|
||||
@@ -6,7 +6,6 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from dataclasses import fields
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):
|
||||
|
||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||
# the engine will automatically process the request in multiple batches.
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
assert llm.llm_engine.model_config.max_model_len >= (
|
||||
args.input_len + args.output_len
|
||||
), (
|
||||
|
||||
@@ -17,7 +17,6 @@ import argparse
|
||||
import json
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import fields
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
|
||||
args.seed = 0
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
tokenizer = llm.get_tokenizer()
|
||||
requests = get_requests(args, tokenizer)
|
||||
|
||||
@@ -16,7 +16,6 @@ import shutil
|
||||
import tempfile
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import fields
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
|
||||
# Measure total startup time
|
||||
start_time = time.perf_counter()
|
||||
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
total_startup_time = time.perf_counter() - start_time
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ import os
|
||||
import random
|
||||
import time
|
||||
import warnings
|
||||
from dataclasses import fields
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
@@ -53,7 +52,7 @@ def run_vllm(
|
||||
) -> tuple[float, list[RequestOutput] | None]:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len
|
||||
>= (request.prompt_len + request.expected_output_len)
|
||||
@@ -141,7 +140,7 @@ def run_vllm_chat(
|
||||
"""
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
llm = LLM.from_engine_args(engine_args)
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len
|
||||
|
||||
+15
-20
@@ -116,29 +116,29 @@ class PassConfig:
|
||||
"""
|
||||
|
||||
# New flags
|
||||
fuse_norm_quant: bool | None = Field(default=None)
|
||||
fuse_norm_quant: bool = None # type: ignore[assignment]
|
||||
"""Fuse the custom RMSNorm + quant ops."""
|
||||
fuse_act_quant: bool | None = Field(default=None)
|
||||
fuse_act_quant: bool = None # type: ignore[assignment]
|
||||
"""Fuse the custom SiluMul + quant ops."""
|
||||
fuse_attn_quant: bool | None = Field(default=None)
|
||||
fuse_attn_quant: bool = None # type: ignore[assignment]
|
||||
"""Fuse the custom attention + quant ops."""
|
||||
eliminate_noops: bool = Field(default=True)
|
||||
"""Eliminate no-op ops."""
|
||||
enable_sp: bool | None = Field(default=None)
|
||||
enable_sp: bool = None # type: ignore[assignment]
|
||||
"""Enable sequence parallelism. Requires TP>1. Automatically disabled
|
||||
if the model's hidden_size is too small for SP to be beneficial
|
||||
(threshold is device-capability dependent)."""
|
||||
fuse_gemm_comms: bool | None = Field(default=None)
|
||||
fuse_gemm_comms: bool = None # type: ignore[assignment]
|
||||
"""Enable async TP."""
|
||||
fuse_allreduce_rms: bool | None = Field(default=None)
|
||||
fuse_allreduce_rms: bool = None # type: ignore[assignment]
|
||||
"""Enable flashinfer allreduce fusion."""
|
||||
enable_qk_norm_rope_fusion: bool = False
|
||||
"""Enable fused Q/K RMSNorm + RoPE pass."""
|
||||
|
||||
# ROCm/AITER specific fusions
|
||||
fuse_act_padding: bool | None = Field(default=None)
|
||||
fuse_act_padding: bool = None # type: ignore[assignment]
|
||||
"""Fuse the custom RMSNorm + padding ops."""
|
||||
fuse_rope_kvcache: bool | None = Field(default=None)
|
||||
fuse_rope_kvcache: bool = None # type: ignore[assignment]
|
||||
"""Fuse the QK rope + KV cache ops."""
|
||||
|
||||
rope_kvcache_fusion_max_token_num: int = 256
|
||||
@@ -405,7 +405,7 @@ class CompilationConfig:
|
||||
"""
|
||||
|
||||
# Top-level Compilation control
|
||||
mode: CompilationMode = Field(default=None) # type: ignore[assignment]
|
||||
mode: CompilationMode = None # type: ignore[assignment]
|
||||
"""The compilation approach used for torch.compile-based compilation of the
|
||||
model.
|
||||
|
||||
@@ -545,7 +545,7 @@ class CompilationConfig:
|
||||
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
|
||||
|
||||
# CudaGraph compilation
|
||||
cudagraph_mode: CUDAGraphMode = Field(default=None) # type: ignore[assignment]
|
||||
cudagraph_mode: CUDAGraphMode = None # type: ignore[assignment]
|
||||
"""
|
||||
The mode of the cudagraph:
|
||||
|
||||
@@ -586,7 +586,7 @@ class CompilationConfig:
|
||||
It means the first several runs will be treated as warmup runs.
|
||||
Only after that, the execution will be recorded, and the recorded
|
||||
cudagraph will be used for subsequent runs."""
|
||||
cudagraph_capture_sizes: list[int] | None = None
|
||||
cudagraph_capture_sizes: list[int] = None # type: ignore[assignment]
|
||||
"""Sizes to capture cudagraph.
|
||||
- None (default): capture sizes are inferred from vllm config.
|
||||
- list[int]: capture sizes are specified as given."""
|
||||
@@ -607,7 +607,7 @@ class CompilationConfig:
|
||||
When `enable_lora` is False, this option has no effect.
|
||||
"""
|
||||
|
||||
use_inductor_graph_partition: bool = Field(default=None) # type: ignore[assignment]
|
||||
use_inductor_graph_partition: bool = None # type: ignore[assignment]
|
||||
"""Use inductor graph partition to split the graph at cudagraph_unsafe ops.
|
||||
This partition happens at inductor codegen time after all passes and fusions
|
||||
are finished. It generates a single `call` function which wraps
|
||||
@@ -630,7 +630,7 @@ class CompilationConfig:
|
||||
pass_config: PassConfig = field(default_factory=PassConfig)
|
||||
"""Custom inductor passes, see PassConfig for more details"""
|
||||
|
||||
max_cudagraph_capture_size: int | None = field(default=None)
|
||||
max_cudagraph_capture_size: int = None # type: ignore[assignment]
|
||||
"""The maximum cudagraph capture size.
|
||||
|
||||
If cudagraph_capture_sizes is specified, this will be set to the largest
|
||||
@@ -750,7 +750,7 @@ class CompilationConfig:
|
||||
return hash_factors(factors)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
exclude = {
|
||||
exclude: dict[str, bool | dict[str, bool]] = {
|
||||
"static_forward_context": True,
|
||||
"enabled_custom_ops": True,
|
||||
"disabled_custom_ops": True,
|
||||
@@ -770,9 +770,7 @@ class CompilationConfig:
|
||||
exclude["pass_config"] = pass_config_exclude
|
||||
|
||||
config = TypeAdapter(CompilationConfig).dump_python(
|
||||
self,
|
||||
exclude=exclude, # type: ignore[arg-type]
|
||||
exclude_unset=True,
|
||||
self, exclude=exclude, exclude_unset=True
|
||||
)
|
||||
|
||||
return str(config)
|
||||
@@ -1023,7 +1021,6 @@ class CompilationConfig:
|
||||
"Unrecognized size type in compile_sizes, "
|
||||
f"expect 'cudagraph_capture_sizes', got {x}"
|
||||
)
|
||||
assert self.cudagraph_capture_sizes is not None
|
||||
computed_compile_sizes.extend(self.cudagraph_capture_sizes)
|
||||
else:
|
||||
assert isinstance(x, int)
|
||||
@@ -1031,7 +1028,6 @@ class CompilationConfig:
|
||||
self.compile_sizes = computed_compile_sizes # type: ignore
|
||||
|
||||
# make sure the sizes are in ascending order
|
||||
assert self.cudagraph_capture_sizes is not None
|
||||
self.cudagraph_capture_sizes.sort()
|
||||
if self.cudagraph_capture_sizes:
|
||||
assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
|
||||
@@ -1123,7 +1119,6 @@ class CompilationConfig:
|
||||
|
||||
def set_splitting_ops_for_attn_fusion(self):
|
||||
assert self.pass_config.fuse_attn_quant
|
||||
assert self.cudagraph_mode is not None
|
||||
if self.splitting_ops is None:
|
||||
self.splitting_ops = []
|
||||
if self.cudagraph_mode.has_piecewise_cudagraphs():
|
||||
|
||||
@@ -13,8 +13,8 @@ from vllm.utils.hashing import safe_hash
|
||||
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
|
||||
|
||||
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
|
||||
class DeviceConfig: # type: ignore[misc]
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True))
|
||||
class DeviceConfig:
|
||||
"""Configuration for the device to use for vLLM execution."""
|
||||
|
||||
device: SkipValidation[Device | torch.device | None] = "auto"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
from collections.abc import Callable
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
from pydantic import field_validator
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
@@ -26,7 +26,7 @@ MoEBackend = Literal[
|
||||
class KernelConfig:
|
||||
"""Configuration for kernel selection and warmup behavior."""
|
||||
|
||||
enable_flashinfer_autotune: bool | None = Field(default=None)
|
||||
enable_flashinfer_autotune: bool = None # type: ignore[assignment]
|
||||
"""If True, run FlashInfer autotuning during kernel warmup."""
|
||||
|
||||
moe_backend: MoEBackend = "auto"
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from vllm.config.utils import config
|
||||
|
||||
|
||||
@@ -18,7 +16,7 @@ class KVEventsConfig:
|
||||
Events can be published externally by zmq using the event publisher config.
|
||||
"""
|
||||
|
||||
publisher: Literal["null", "zmq"] | None = Field(default=None)
|
||||
publisher: Literal["null", "zmq"] = None # type: ignore[assignment]
|
||||
"""The publisher to use for publishing kv events. Can be "null", "zmq".
|
||||
"""
|
||||
|
||||
|
||||
+2
-2
@@ -25,8 +25,8 @@ MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
|
||||
LoRAExtraVocabSize = Literal[256, 512]
|
||||
|
||||
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
|
||||
class LoRAConfig: # type: ignore[misc]
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True))
|
||||
class LoRAConfig:
|
||||
"""Configuration for LoRA."""
|
||||
|
||||
max_lora_rank: MaxLoRARanks = 16
|
||||
|
||||
@@ -102,8 +102,8 @@ AttnTypeStr = Literal[
|
||||
]
|
||||
|
||||
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
|
||||
class ModelConfig: # type: ignore[misc]
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True))
|
||||
class ModelConfig:
|
||||
"""Configuration for the model."""
|
||||
|
||||
model: str = "Qwen/Qwen3-0.6B"
|
||||
@@ -121,7 +121,7 @@ class ModelConfig: # type: ignore[misc]
|
||||
"""Convert the model using adapters defined in
|
||||
[vllm.model_executor.models.adapters][]. The most common use case is to
|
||||
adapt a text generation model to be used for pooling tasks."""
|
||||
tokenizer: str = Field(default=None) # type: ignore[assignment]
|
||||
tokenizer: str = None # type: ignore[assignment]
|
||||
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
|
||||
name or path will be used."""
|
||||
tokenizer_mode: TokenizerMode | str = "auto"
|
||||
@@ -583,7 +583,7 @@ class ModelConfig: # type: ignore[misc]
|
||||
self.dtype,
|
||||
is_pooling_model=self.runner_type == "pooling",
|
||||
revision=self.revision,
|
||||
config_format=self.config_format, # type: ignore[arg-type]
|
||||
config_format=self.config_format,
|
||||
)
|
||||
|
||||
self.original_max_model_len = self.max_model_len
|
||||
@@ -733,7 +733,7 @@ class ModelConfig: # type: ignore[misc]
|
||||
|
||||
@property
|
||||
def architectures(self) -> list[str]:
|
||||
return self.model_arch_config.architectures # type: ignore[return-value]
|
||||
return self.model_arch_config.architectures
|
||||
|
||||
@property
|
||||
def architecture(self) -> str:
|
||||
@@ -1944,7 +1944,7 @@ def _get_and_verify_dtype(
|
||||
*,
|
||||
is_pooling_model: bool,
|
||||
revision: str | None = None,
|
||||
config_format: ConfigFormat = "hf",
|
||||
config_format: str | ConfigFormat = "hf",
|
||||
) -> torch.dtype:
|
||||
config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
|
||||
config, model_id, revision=revision, config_format=config_format
|
||||
|
||||
@@ -16,7 +16,7 @@ class ModelArchitectureConfig:
|
||||
Configuration for model architecture that required by vLLM runtime
|
||||
"""
|
||||
|
||||
architectures: list[str] | None
|
||||
architectures: list[str]
|
||||
"""List of model architecture class names (e.g., ['LlamaForCausalLM']).
|
||||
It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
|
||||
|
||||
|
||||
@@ -194,7 +194,7 @@ class ParallelConfig:
|
||||
threshold, microbatching will be used. Otherwise, the request will be
|
||||
processed in a single batch."""
|
||||
|
||||
disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
|
||||
disable_nccl_for_dp_synchronization: bool | None = None
|
||||
"""Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
|
||||
to use Gloo instead of NCCL for its all reduce.
|
||||
|
||||
|
||||
@@ -52,7 +52,7 @@ class SchedulerConfig:
|
||||
In real usage, this should be set in `EngineArgs.create_engine_config`.
|
||||
"""
|
||||
|
||||
max_num_scheduled_tokens: int | None = Field(default=None)
|
||||
max_num_scheduled_tokens: int | None = None
|
||||
"""Maximum number of tokens that the scheduler may issue in a single iteration.
|
||||
|
||||
This is usually equal to max_num_batched_tokens, but can be smaller in cases
|
||||
@@ -122,7 +122,7 @@ class SchedulerConfig:
|
||||
|
||||
# scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
|
||||
# (default) or "mod.custom_class".
|
||||
scheduler_cls: str | type[object] | None = Field(default=None)
|
||||
scheduler_cls: str | type[object] | None = None
|
||||
"""The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
|
||||
the default scheduler. Can be a class directly or the path to a class of
|
||||
form "mod.custom_class"."""
|
||||
@@ -141,7 +141,7 @@ class SchedulerConfig:
|
||||
checking the first chunk. Prevents over-admission and KV cache thrashing
|
||||
with chunked prefill."""
|
||||
|
||||
async_scheduling: bool | None = Field(default=None)
|
||||
async_scheduling: bool | None = None
|
||||
"""If set to False, disable async scheduling. Async scheduling helps to
|
||||
avoid gaps in GPU utilization, leading to better latency and throughput.
|
||||
"""
|
||||
|
||||
+14
-4
@@ -11,13 +11,13 @@ import os
|
||||
import pathlib
|
||||
import textwrap
|
||||
from collections.abc import Callable, Mapping, Sequence, Set
|
||||
from dataclasses import MISSING, dataclass, field, fields, is_dataclass
|
||||
from dataclasses import MISSING, field, fields, is_dataclass
|
||||
from itertools import pairwise
|
||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
|
||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast, overload
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict
|
||||
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
||||
from pydantic.dataclasses import dataclass
|
||||
from pydantic.fields import Field as PydanticField
|
||||
from pydantic.fields import FieldInfo
|
||||
from typing_extensions import dataclass_transform, runtime_checkable
|
||||
@@ -36,6 +36,16 @@ ConfigType = type[DataclassInstance]
|
||||
ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
|
||||
|
||||
|
||||
@overload
|
||||
def config(cls: type[ConfigT]) -> type[ConfigT]: ...
|
||||
|
||||
|
||||
@overload
|
||||
def config(
|
||||
*, config: ConfigDict | None = None, **kwargs: Any
|
||||
) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
|
||||
|
||||
|
||||
@dataclass_transform(field_specifiers=(PydanticField,))
|
||||
def config(
|
||||
cls: type[ConfigT] | None = None,
|
||||
@@ -59,7 +69,7 @@ def config(
|
||||
merged_config.update(config)
|
||||
|
||||
def decorator(cls: type[ConfigT]) -> type[ConfigT]:
|
||||
return pydantic_dataclass(cls, config=merged_config, **kwargs) # type: ignore[return-value]
|
||||
return dataclass(cls, config=merged_config, **kwargs) # type: ignore[return-value]
|
||||
|
||||
# Called with arguments: @config(config=...)
|
||||
if cls is None:
|
||||
|
||||
+9
-14
@@ -246,15 +246,15 @@ OPTIMIZATION_LEVEL_TO_CONFIG = {
|
||||
}
|
||||
|
||||
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True)) # type: ignore[arg-type,misc]
|
||||
class VllmConfig: # type: ignore[misc]
|
||||
@config(config=ConfigDict(arbitrary_types_allowed=True))
|
||||
class VllmConfig:
|
||||
"""Dataclass which contains all vllm-related configuration. This
|
||||
simplifies passing around the distinct configurations in the codebase.
|
||||
"""
|
||||
|
||||
# TODO: use default_factory once default constructing ModelConfig doesn't
|
||||
# try to download a model
|
||||
model_config: ModelConfig = Field(default=None) # type: ignore[assignment]
|
||||
model_config: ModelConfig = None # type: ignore[assignment]
|
||||
"""Model configuration."""
|
||||
cache_config: CacheConfig = Field(default_factory=CacheConfig)
|
||||
"""Cache configuration."""
|
||||
@@ -912,7 +912,8 @@ class VllmConfig: # type: ignore[misc]
|
||||
|
||||
tp_size = self.parallel_config.tensor_parallel_size
|
||||
hidden_size = self.model_config.get_hidden_size()
|
||||
element_size = self.model_config.dtype.itemsize # type: ignore[union-attr]
|
||||
assert isinstance(self.model_config.dtype, torch.dtype)
|
||||
element_size = self.model_config.dtype.itemsize
|
||||
pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
|
||||
hidden_size, tp_size, element_size
|
||||
)
|
||||
@@ -1246,14 +1247,6 @@ class VllmConfig: # type: ignore[misc]
|
||||
)
|
||||
self.compilation_config.debug_dump_path = env_path
|
||||
|
||||
def has_blocked_weights(): # type: ignore[no-redef]
|
||||
if self.quant_config is not None:
|
||||
if hasattr(self.quant_config, "weight_block_size"):
|
||||
return self.quant_config.weight_block_size is not None
|
||||
elif hasattr(self.quant_config, "has_blocked_weights"):
|
||||
return self.quant_config.has_blocked_weights()
|
||||
return False
|
||||
|
||||
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
|
||||
# On H100 the CUDA kernel is faster than
|
||||
# native implementation
|
||||
@@ -1502,9 +1495,10 @@ class VllmConfig: # type: ignore[misc]
|
||||
tp_size = self.parallel_config.tensor_parallel_size
|
||||
max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
|
||||
if max_size is not None:
|
||||
assert isinstance(self.model_config.dtype, torch.dtype)
|
||||
max_token_num = max_size // (
|
||||
self.model_config.get_hidden_size()
|
||||
* self.model_config.dtype.itemsize # type: ignore[union-attr]
|
||||
* self.model_config.dtype.itemsize
|
||||
)
|
||||
if compile_range_end is not None and max_token_num < compile_range_end:
|
||||
computed_compile_ranges_endpoints.append(max_token_num)
|
||||
@@ -1527,7 +1521,8 @@ class VllmConfig: # type: ignore[misc]
|
||||
|
||||
tp_size = self.parallel_config.tensor_parallel_size
|
||||
hidden_size = self.model_config.get_hidden_size()
|
||||
element_size = self.model_config.dtype.itemsize # type: ignore[union-attr]
|
||||
assert isinstance(self.model_config.dtype, torch.dtype)
|
||||
element_size = self.model_config.dtype.itemsize
|
||||
pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
|
||||
hidden_size, tp_size, element_size
|
||||
)
|
||||
|
||||
@@ -1935,7 +1935,7 @@ class EngineArgs:
|
||||
)
|
||||
|
||||
offload_config = OffloadConfig(
|
||||
offload_backend=self.offload_backend, # type: ignore[arg-type]
|
||||
offload_backend=self.offload_backend,
|
||||
uva=UVAOffloadConfig(
|
||||
cpu_offload_gb=self.cpu_offload_gb,
|
||||
cpu_offload_params=self.cpu_offload_params,
|
||||
|
||||
@@ -409,6 +409,11 @@ class LLM:
|
||||
# Cache for __repr__ to avoid repeated collective_rpc calls
|
||||
self._cached_repr: str | None = None
|
||||
|
||||
@classmethod
|
||||
def from_engine_args(cls, engine_args: EngineArgs) -> "LLM":
|
||||
"""Create an LLM instance from EngineArgs."""
|
||||
return cls(**vars(engine_args))
|
||||
|
||||
def get_tokenizer(self) -> TokenizerLike:
|
||||
return self.llm_engine.get_tokenizer()
|
||||
|
||||
|
||||
@@ -28,7 +28,10 @@ class ModelArchConfigConvertorBase:
|
||||
self.hf_text_config = hf_text_config
|
||||
|
||||
def get_architectures(self) -> list[str]:
|
||||
return getattr(self.hf_config, "architectures", [])
|
||||
# Sometimes we get here from `vllm_config.with_hf_config(text_config)` where
|
||||
# `text_config` is a sub-config from a multi-modal model. If this is the case,
|
||||
# the sub-config will not have `architectures` and it will explicitly be `None`
|
||||
return getattr(self.hf_config, "architectures", None) or []
|
||||
|
||||
def get_num_hidden_layers(self) -> int:
|
||||
return getattr(self.hf_text_config, "num_hidden_layers", 0)
|
||||
@@ -128,7 +131,7 @@ class ModelArchConfigConvertorBase:
|
||||
hf_config: PretrainedConfig,
|
||||
model_id: str,
|
||||
revision: str | None,
|
||||
config_format: ConfigFormat,
|
||||
config_format: str | ConfigFormat,
|
||||
):
|
||||
# NOTE: getattr(config, "dtype", torch.float32) is not correct
|
||||
# because config.dtype can be None.
|
||||
|
||||
Reference in New Issue
Block a user