mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
Speed up docs build (#44635)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -103,6 +103,8 @@ pre-commit run mypy-3.10 --all-files --hook-stage manual
|
||||
|
||||
The line length limit for Python code is 88 characters. If you are not sure, use pre-commit to check.
|
||||
|
||||
Use [Google-style docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (`Args:`/`Returns:`/`Raises:` sections), not reStructuredText/Sphinx fields (`:param:`, `:return:`, `:rtype:`).
|
||||
|
||||
### Commit messages
|
||||
|
||||
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
|
||||
|
||||
@@ -169,7 +169,7 @@ speculative decoding, breaking down the guarantees into three key areas:
|
||||
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
|
||||
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
|
||||
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
|
||||
> provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](/tests/v1/spec_decode).
|
||||
> provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../../tests/v1/spec_decode).
|
||||
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
|
||||
|
||||
3. **vLLM Logprob Stability**
|
||||
|
||||
+9
-9
@@ -83,22 +83,22 @@ plugins:
|
||||
- "re:vllm\\._.*" # Internal modules
|
||||
- "vllm.third_party"
|
||||
- "vllm.vllm_flash_attn"
|
||||
- "re:vllm\\.grpc\\..*_pb2.*" # Auto-generated protobuf files
|
||||
- "vllm.transformers_utils.configs"
|
||||
- "vllm.transformers_utils.processors"
|
||||
- !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default
|
||||
- mkdocstrings:
|
||||
handlers:
|
||||
python:
|
||||
options:
|
||||
show_symbol_type_heading: true
|
||||
show_symbol_type_toc: true
|
||||
filters:
|
||||
- "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs
|
||||
summary:
|
||||
modules: true
|
||||
show_signature_annotations: true
|
||||
separate_signature: true
|
||||
filters: []
|
||||
show_overloads: true
|
||||
signature_crossrefs: true
|
||||
# Recommendations from api-autonav
|
||||
docstring_section_style: list
|
||||
parameter_headings: true
|
||||
show_symbol_type_heading: true
|
||||
show_symbol_type_toc: true
|
||||
summary: true
|
||||
inventories:
|
||||
- https://docs.python.org/3/objects.inv
|
||||
- https://typing-extensions.readthedocs.io/en/latest/objects.inv
|
||||
|
||||
+10
-6
@@ -868,9 +868,10 @@ def cutlass_scaled_mm_azp(
|
||||
bias: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
:param azp_adj: In the per-tensor case, this should include the azp.
|
||||
Always per-channel.
|
||||
:param azp: Only set in the per-token case. Per-token if set.
|
||||
Args:
|
||||
azp_adj: In the per-tensor case, this should include the azp.
|
||||
Always per-channel.
|
||||
azp: Only set in the per-token case. Per-token if set.
|
||||
"""
|
||||
assert b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
|
||||
assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
|
||||
@@ -3886,9 +3887,12 @@ def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
|
||||
Note that sylvester hadamard transforms are also symmetric, which means that
|
||||
this function is also applies the (transpose <=> inverse) transform.
|
||||
|
||||
:param x: value to be transformed inplace
|
||||
:param inplace: modify value in place
|
||||
:return: value after transformation
|
||||
Args:
|
||||
x: value to be transformed inplace
|
||||
inplace: modify value in place
|
||||
|
||||
Returns:
|
||||
value after transformation
|
||||
"""
|
||||
return torch.ops._C.hadacore_transform(x, inplace)
|
||||
|
||||
|
||||
@@ -82,11 +82,12 @@ class InductorPass(CustomGraphPass): # type: ignore[misc]
|
||||
def hash_source(*srcs: str | Any) -> str:
|
||||
"""
|
||||
Utility method to hash the sources of functions or objects.
|
||||
:param srcs: strings or objects to add to the hash.
|
||||
Objects and functions have their source inspected.
|
||||
Results are cached by resolved types to avoid repeated
|
||||
inspect.getsource() calls.
|
||||
:return:
|
||||
|
||||
Args:
|
||||
srcs: strings or objects to add to the hash.
|
||||
Objects and functions have their source inspected.
|
||||
Results are cached by resolved types to avoid repeated
|
||||
inspect.getsource() calls.
|
||||
"""
|
||||
# Resolve instances to their class for a hashable cache key.
|
||||
cache_key = tuple(
|
||||
@@ -99,7 +100,9 @@ class InductorPass(CustomGraphPass): # type: ignore[misc]
|
||||
def hash_dict(dict_: dict[Any, Any]) -> str:
|
||||
"""
|
||||
Utility method to hash a dictionary, can alternatively be used for uuid.
|
||||
:return: A sha256 hash of the json rep of the dictionary.
|
||||
|
||||
Returns:
|
||||
A sha256 hash of the json rep of the dictionary.
|
||||
"""
|
||||
encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
|
||||
return hashlib.sha256(encoded).hexdigest()
|
||||
|
||||
@@ -276,9 +276,11 @@ class FixFunctionalizationPass(VllmInductorPass):
|
||||
"""
|
||||
Replace mutated getitem users of the auto-functionalized node with the
|
||||
mutated arguments.
|
||||
:param node: The auto-functionalized node
|
||||
:param mutated_args: The mutated arguments, indexed by getitem index.
|
||||
If the value of an arg is a string, `node.kwargs[arg]` is used.
|
||||
|
||||
Args:
|
||||
node: The auto-functionalized node
|
||||
mutated_args: The mutated arguments, indexed by getitem index.
|
||||
If the value of an arg is a string, `node.kwargs[arg]` is used.
|
||||
"""
|
||||
for idx, user in self.getitem_users(node).items():
|
||||
# Some functionalized nodes may return both a result at getitem[0]
|
||||
@@ -317,10 +319,11 @@ class FixFunctionalizationPass(VllmInductorPass):
|
||||
as node.kwargs cannot be used.
|
||||
See https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351
|
||||
|
||||
:param graph: Graph to insert the defunctionalized node into
|
||||
:param node: The auto-functionalized node to defunctionalize
|
||||
:param args: If we cannot use kwargs, specify args directly.
|
||||
If an arg is a string, `node.kwargs[arg]` is used.
|
||||
Args:
|
||||
graph: Graph to insert the defunctionalized node into
|
||||
node: The auto-functionalized node to defunctionalize
|
||||
args: If we cannot use kwargs, specify args directly.
|
||||
If an arg is a string, `node.kwargs[arg]` is used.
|
||||
""" # noqa: E501
|
||||
assert is_func(node, auto_functionalized), (
|
||||
f"node must be auto-functionalized, is {node} instead"
|
||||
|
||||
@@ -108,9 +108,13 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
def dims_equivalent(self, dim: int | SymInt, i_dim: int | SymInt) -> bool:
|
||||
"""
|
||||
This function checks if two dimensions are equivalent.
|
||||
:param dim: The dimension arg to reshape/slice
|
||||
:param i_dim: The corresponding dimension in the input tensor
|
||||
:return: Are the dimensions equivalent?
|
||||
|
||||
Args:
|
||||
dim: The dimension arg to reshape/slice
|
||||
i_dim: The corresponding dimension in the input tensor
|
||||
|
||||
Returns:
|
||||
Are the dimensions equivalent?
|
||||
|
||||
There are two cases in which the dimensions are equivalent:
|
||||
1. The dimensions are equal (both integers)
|
||||
|
||||
@@ -180,8 +180,9 @@ class CuMemAllocator:
|
||||
All data in the memory allocation with the specified tag will be
|
||||
offloaded to CPU memory, and others will be discarded.
|
||||
|
||||
:param offload_tags: The tags of the memory allocation that will be
|
||||
offloaded. The rest of the memory allocation will be discarded.
|
||||
Args:
|
||||
offload_tags: The tags of the memory allocation that will be
|
||||
offloaded. The rest of the memory allocation will be discarded.
|
||||
"""
|
||||
if offload_tags is None:
|
||||
# by default, allocated tensors are offloaded
|
||||
@@ -230,9 +231,10 @@ class CuMemAllocator:
|
||||
All data that is previously offloaded will be loaded back to GPU
|
||||
memory, and the rest of the data will have empty memory.
|
||||
|
||||
:param tags: The tags of the memory allocation that will be loaded
|
||||
back to GPU memory. If None, all memory allocation will be loaded
|
||||
back to GPU memory.
|
||||
Args:
|
||||
tags: The tags of the memory allocation that will be loaded
|
||||
back to GPU memory. If None, all memory allocation will be loaded
|
||||
back to GPU memory.
|
||||
"""
|
||||
for ptr, data in self.pointer_to_data.items():
|
||||
if tags is None or data.tag in tags:
|
||||
@@ -255,8 +257,9 @@ class CuMemAllocator:
|
||||
All memory allocation created inside the context will be allocated
|
||||
in the memory pool, and has the specified tag.
|
||||
|
||||
:param tag: The tag of the memory allocation. If None, the default tag
|
||||
will be used.
|
||||
Args:
|
||||
tag: The tag of the memory allocation. If None, the default tag
|
||||
will be used.
|
||||
"""
|
||||
if tag is None:
|
||||
tag = CuMemAllocator.default_tag
|
||||
|
||||
@@ -132,7 +132,8 @@ class KVEventAggregator:
|
||||
"""
|
||||
Add events from a worker batch.
|
||||
|
||||
:param events: List of KVCacheEvent objects.
|
||||
Args:
|
||||
events: List of KVCacheEvent objects.
|
||||
"""
|
||||
if not isinstance(events, list):
|
||||
raise TypeError("events must be a list of KVCacheEvent.")
|
||||
@@ -142,7 +143,8 @@ class KVEventAggregator:
|
||||
"""
|
||||
Return events that appeared in all workers.
|
||||
|
||||
:return: List of events present in all workers.
|
||||
Returns:
|
||||
List of events present in all workers.
|
||||
"""
|
||||
return [
|
||||
event
|
||||
@@ -154,7 +156,8 @@ class KVEventAggregator:
|
||||
"""
|
||||
Return all events for all workers.
|
||||
|
||||
:return: List of events for all workers.
|
||||
Returns:
|
||||
List of events for all workers.
|
||||
"""
|
||||
return list(self._event_counter.elements())
|
||||
|
||||
@@ -168,7 +171,8 @@ class KVEventAggregator:
|
||||
"""
|
||||
Increment the number of workers contributing events.
|
||||
|
||||
:param count: Number to increment the workers by.
|
||||
Args:
|
||||
count: Number to increment the workers by.
|
||||
"""
|
||||
if count <= 0:
|
||||
raise ValueError("count must be positive.")
|
||||
@@ -184,7 +188,8 @@ class KVEventAggregator:
|
||||
"""
|
||||
Return the number of workers.
|
||||
|
||||
:return: int number of workers.
|
||||
Returns:
|
||||
int number of workers.
|
||||
"""
|
||||
return self._num_workers
|
||||
|
||||
|
||||
@@ -439,13 +439,12 @@ def _init_lmcache_engine(
|
||||
`LMCACHE_CONFIG_FILE` to load the configuration file. If that environment
|
||||
variable is not set, this function will return None.
|
||||
|
||||
:param lmcache_config: The LMCache configuration.
|
||||
:type lmcache_config: LMCacheEngineConfig
|
||||
:param vllm_config: The vLLM configuration.
|
||||
:type vllm_config: VllmConfig
|
||||
Args:
|
||||
lmcache_config: The LMCache configuration.
|
||||
vllm_config: The vLLM configuration.
|
||||
|
||||
:return: The initialized LMCache engine
|
||||
:rtype: LMCacheEngine
|
||||
Returns:
|
||||
The initialized LMCache engine
|
||||
"""
|
||||
if curr_engine := LMCacheEngineBuilder.get(ENGINE_NAME):
|
||||
return curr_engine
|
||||
|
||||
+17
-11
@@ -113,11 +113,14 @@ def register_op(
|
||||
"""
|
||||
Register a new vLLM IR op.
|
||||
|
||||
:param f: the native implementation of the op
|
||||
:param name: the name of the op, defaults to the function name
|
||||
:param activations: list of activation params, defaults to params starting with 'x'
|
||||
:param allow_inplace: add a maybe_inplace overload that allows inplace impls
|
||||
:return: the IrOp object if f is provided, otherwise a decorator
|
||||
Args:
|
||||
f: the native implementation of the op
|
||||
name: the name of the op, defaults to the function name
|
||||
activations: list of activation params, defaults to params starting with 'x'
|
||||
allow_inplace: add a maybe_inplace overload that allows inplace impls
|
||||
|
||||
Returns:
|
||||
the IrOp object if f is provided, otherwise a decorator
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
@@ -245,14 +248,17 @@ class IrOp:
|
||||
supported: bool = True,
|
||||
supports_args: Callable[..., bool] | None = None,
|
||||
inplace: bool = False,
|
||||
):
|
||||
) -> Callable[[Callable[..., Any]], "IrOpImpl"]:
|
||||
"""
|
||||
Register an implementation for this custom op.
|
||||
:param provider: The name of the provider, must be unique.
|
||||
:param supported: Static support check, use this to check platform support.
|
||||
:param supports_args: Dynamic arg support check, used for types and shapes.
|
||||
:param inplace: Does this op reuse activation input memory for outputs
|
||||
:return: A decorator that registers the implementation.
|
||||
Args:
|
||||
provider: The name of the provider, must be unique.
|
||||
supported: Static support check, use this to check platform support.
|
||||
supports_args: Dynamic arg support check, used for types and shapes.
|
||||
inplace: Does this op reuse activation input memory for outputs
|
||||
|
||||
Returns:
|
||||
A decorator that registers the implementation.
|
||||
|
||||
The decorated function must have the same semantics and signature as
|
||||
the native implementation.
|
||||
|
||||
+3
-3
@@ -12,9 +12,9 @@ from typing import Any
|
||||
def hash_source(*srcs: str | Any) -> str:
|
||||
"""
|
||||
Utility method to hash the sources of functions or objects.
|
||||
:param srcs: strings or objects to add to the hash.
|
||||
Objects and functions have their source inspected.
|
||||
:return:
|
||||
Args:
|
||||
srcs: strings or objects to add to the hash.
|
||||
Objects and functions have their source inspected.
|
||||
"""
|
||||
hasher = hashlib.sha256()
|
||||
for src in srcs:
|
||||
|
||||
@@ -599,12 +599,14 @@ class FusedMoE(PluggableLayer):
|
||||
):
|
||||
"""
|
||||
Load grouped weight scales for group quantization or model weights
|
||||
:param shard_dim: dimension to shard
|
||||
:param expert_data: parameter for a particular expert
|
||||
:param shard_id: either w1, w2, or w3
|
||||
:param loaded_weight: checkpoint weight to load into the param
|
||||
:param tp_rank: tensor parallel rank
|
||||
:param load_full_w2: whether or not the w2 loaded should be sharded.
|
||||
|
||||
Args:
|
||||
shard_dim: dimension to shard
|
||||
expert_data: parameter for a particular expert
|
||||
shard_id: either w1, w2, or w3
|
||||
loaded_weight: checkpoint weight to load into the param
|
||||
tp_rank: tensor parallel rank
|
||||
load_full_w2: whether or not the w2 loaded should be sharded.
|
||||
"""
|
||||
if shard_id == "w2":
|
||||
# In the case where we have actorder/g_idx, we do not partition the
|
||||
|
||||
@@ -178,8 +178,9 @@ class QuantizationConfig(ABC):
|
||||
Interface for models to update module names referenced in
|
||||
quantization configs in order to reflect the vllm model structure
|
||||
|
||||
:param hf_to_vllm_mapper: maps from hf model structure (the assumed
|
||||
structure of the qconfig) to vllm model structure
|
||||
Args:
|
||||
hf_to_vllm_mapper: maps from hf model structure (the assumed
|
||||
structure of the qconfig) to vllm model structure
|
||||
"""
|
||||
# TODO (@kylesayrs): add implementations for all subclasses
|
||||
pass
|
||||
|
||||
@@ -267,8 +267,11 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
cls, config: dict[str, Any]
|
||||
) -> tuple[dict[str, SparsityCompressionConfig], list[str]]:
|
||||
"""
|
||||
:param config: The `quantization_config` dictionary from config.json
|
||||
:return: A tuple with two elements
|
||||
Args:
|
||||
config: The `quantization_config` dictionary from config.json
|
||||
|
||||
Returns:
|
||||
A tuple with two elements
|
||||
1. A dictionary mapping target layer names to their corresponding
|
||||
sparsity_config
|
||||
2. A list of layer names to ignore for sparsity
|
||||
@@ -296,8 +299,11 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
cls, config: dict[str, Any]
|
||||
) -> QUANTIZATION_SCHEME_MAP_TYPE:
|
||||
"""
|
||||
:param config: The `quantization_config` dictionary from config.json
|
||||
:return: A dictionary mapping target layer names to their corresponding
|
||||
Args:
|
||||
config: The `quantization_config` dictionary from config.json
|
||||
|
||||
Returns:
|
||||
A dictionary mapping target layer names to their corresponding
|
||||
quantization_args for weights and input activations
|
||||
"""
|
||||
target_scheme_map: dict[str, Any] = dict()
|
||||
@@ -967,7 +973,9 @@ class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
|
||||
"""
|
||||
Validator for the kv cache scheme. Useful for controlling the
|
||||
kv cache quantization schemes, that are being supported in vLLM
|
||||
:param kv_cache_scheme: the compressed-tensors kv cache scheme
|
||||
|
||||
Args:
|
||||
kv_cache_scheme: the compressed-tensors kv cache scheme
|
||||
"""
|
||||
if kv_cache_scheme is None:
|
||||
return
|
||||
|
||||
+5
-5
@@ -38,11 +38,11 @@ class CompressedTensorsScheme(ABC):
|
||||
Run the forward pass for the particular scheme. This is where
|
||||
scheme-specific dequant/quant steps/kernels should be applied.
|
||||
|
||||
:param layer: torch.nn.Module with the registered weights and
|
||||
other parameters relevant to the particular scheme.
|
||||
:param x: input to the layer
|
||||
:param bias: bias parameter
|
||||
|
||||
Args:
|
||||
layer: torch.nn.Module with the registered weights and
|
||||
other parameters relevant to the particular scheme.
|
||||
x: input to the layer
|
||||
bias: bias parameter
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
@@ -133,12 +133,11 @@ def find_matched_target(
|
||||
*All* component module names must match in order for a match to be
|
||||
successful. A successful match returns the first component target
|
||||
|
||||
:param layer_name: layer name
|
||||
:param module: torch.nn.Module
|
||||
:param targets: list of targets to match the layer against
|
||||
:param fused_mapping: map from fused layer names to its components
|
||||
:param fused_strategy: either "all" or "any". If using "all", fused
|
||||
layers match if "all" of its components match
|
||||
Args:
|
||||
layer_name: layer name
|
||||
module: torch.nn.Module
|
||||
targets: list of targets to match the layer against
|
||||
fused_mapping: map from fused layer names to its components
|
||||
"""
|
||||
|
||||
if layer_name is None:
|
||||
@@ -161,9 +160,10 @@ def _find_first_match(
|
||||
exactly or as a regex after 're:'. If check_contains is set to True,
|
||||
additionally checks if the target string is contained within the value.
|
||||
|
||||
:param value: string to compare the list of targets against
|
||||
:param targets: list of targets to match the layer against
|
||||
:param check_contains: whether or not to do a substring match
|
||||
Args:
|
||||
value: string to compare the list of targets against
|
||||
targets: list of targets to match the layer against
|
||||
check_contains: whether or not to do a substring match
|
||||
"""
|
||||
|
||||
for target in targets:
|
||||
@@ -205,9 +205,10 @@ def _match_fused_layer(
|
||||
Implements an "all" matching strategy where a fused layer matches iff
|
||||
"all" of its components match
|
||||
|
||||
:param layer_name: layer name
|
||||
:param target_layers: list of targets to match the layer against
|
||||
:param fused_mapping: map from fused layer names to its components
|
||||
Args:
|
||||
layer_name: layer name
|
||||
target_layers: list of targets to match the layer against
|
||||
fused_mapping: map from fused layer names to its components
|
||||
|
||||
Examples:
|
||||
layer_name = "model.layers.0.self_attn.qkv_proj"
|
||||
|
||||
@@ -114,8 +114,9 @@ class GGUFConfig(QuantizationConfig):
|
||||
Interface for models to update module names referenced in
|
||||
quantization configs in order to reflect the vllm model structure
|
||||
|
||||
:param hf_to_vllm_mapper: maps from hf model structure (the assumed
|
||||
structure of the qconfig) to vllm model structure
|
||||
Args:
|
||||
hf_to_vllm_mapper: maps from hf model structure (the assumed
|
||||
structure of the qconfig) to vllm model structure
|
||||
"""
|
||||
if self.unquantized_modules is not None:
|
||||
self.unquantized_modules = hf_to_vllm_mapper.apply_list(
|
||||
|
||||
@@ -46,16 +46,17 @@ class QuantFP8(CustomOp):
|
||||
compile_native: bool = True,
|
||||
):
|
||||
"""
|
||||
:param static: static or dynamic quantization
|
||||
:param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR,
|
||||
PER_CHANNEL, or arbitrary block size)
|
||||
:param num_token_padding: Pad the token dimension of output to this
|
||||
size
|
||||
:param tma_aligned_scales: For group quantization, output scales in
|
||||
TMA-aligned layout
|
||||
:param column_major_scales: For group quantization, output scales in
|
||||
column major format
|
||||
:param compile_native: Manually compile forward_native if compile mode > None
|
||||
Args:
|
||||
static: static or dynamic quantization
|
||||
group_shape: quantization group shape (PER_TOKEN, PER_TENSOR,
|
||||
PER_CHANNEL, or arbitrary block size)
|
||||
num_token_padding: Pad the token dimension of output to this
|
||||
size
|
||||
tma_aligned_scales: For group quantization, output scales in
|
||||
TMA-aligned layout
|
||||
column_major_scales: For group quantization, output scales in
|
||||
column major format
|
||||
compile_native: Manually compile forward_native if compile mode > None
|
||||
"""
|
||||
super().__init__(compile_native=compile_native)
|
||||
self.static = static
|
||||
|
||||
@@ -47,7 +47,8 @@ class BaseKVCacheMethod(QuantizeMethodBase):
|
||||
- quantize k/v_cache entries before saving them to the cache
|
||||
- dequantize k/v_cache entries before fetching them from the cache
|
||||
|
||||
:param quant_config: the appropriate QuantizationConfig
|
||||
Args:
|
||||
quant_config: the appropriate QuantizationConfig
|
||||
"""
|
||||
|
||||
def __init__(self, quant_config: QuantizationConfig):
|
||||
|
||||
@@ -87,8 +87,9 @@ class QuarkConfig(QuantizationConfig):
|
||||
Interface for models to update module names referenced in
|
||||
quantization configs in order to reflect the vllm model structure
|
||||
|
||||
:param hf_to_vllm_mapper: maps from hf model structure (the assumed
|
||||
structure of the qconfig) to vllm model structure
|
||||
Args:
|
||||
hf_to_vllm_mapper: maps from hf model structure (the assumed
|
||||
structure of the qconfig) to vllm model structure
|
||||
"""
|
||||
quant_config_with_hf_to_vllm_mapper: dict[str, Any] = {}
|
||||
|
||||
@@ -724,7 +725,9 @@ class QuarkKVCacheMethod(BaseKVCacheMethod):
|
||||
"""
|
||||
Validator for the kv cache configuration. Useful for controlling the
|
||||
kv cache quantization schemes, that are being supported in vLLM
|
||||
:param kv_cache_config: the quark kv cache scheme
|
||||
|
||||
Args:
|
||||
kv_cache_config: the quark kv cache scheme
|
||||
"""
|
||||
if kv_cache_config is None:
|
||||
return
|
||||
|
||||
@@ -38,11 +38,11 @@ class QuarkScheme(ABC):
|
||||
Run the forward pass for the particular scheme. This is where
|
||||
scheme-specific dequant/quant steps/kernels should be applied.
|
||||
|
||||
:param layer: torch.nn.Module with the registered weights and
|
||||
other parameters relevant to the particular scheme.
|
||||
:param x: input to the layer
|
||||
:param bias: bias parameter
|
||||
|
||||
Args:
|
||||
layer: torch.nn.Module with the registered weights and
|
||||
other parameters relevant to the particular scheme.
|
||||
x: input to the layer
|
||||
bias: bias parameter
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -123,7 +123,8 @@ def initialize_online_processing(layer: torch.nn.Module):
|
||||
Called by either `initialize_layerwise_reload` or an online quantization scheme,
|
||||
prevents double wrapping in the case of online quantization + reloading
|
||||
|
||||
:param layer: layer whose parameter weight loaders will be wrapped
|
||||
Args:
|
||||
layer: layer whose parameter weight loaders will be wrapped
|
||||
"""
|
||||
info = get_layerwise_info(layer)
|
||||
|
||||
@@ -222,8 +223,9 @@ def finalize_layerwise_processing(model: torch.nn.Module, model_config: ModelCon
|
||||
This function should be applied after `initialize_layerwise_reload` is applied
|
||||
unwrap the layerwise weight loaders.
|
||||
|
||||
:param model: model to finalize processing for
|
||||
:param model_config: config needed for applying processing to attention layers
|
||||
Args:
|
||||
model: model to finalize processing for
|
||||
model_config: config needed for applying processing to attention layers
|
||||
"""
|
||||
if hasattr(model, "_original_do_torchao_reload"):
|
||||
model._do_torchao_reload = model._original_do_torchao_reload
|
||||
|
||||
@@ -175,9 +175,12 @@ def get_numel_loaded(
|
||||
"""
|
||||
Determine how many elements would be loaded by a weight loader call.
|
||||
|
||||
:param weight loader: used to load weights
|
||||
:param args: bound arguments to weight loader
|
||||
:return: number of elements loaded by the weight loader, the return value of the
|
||||
Args:
|
||||
weight_loader: used to load weights
|
||||
args: bound arguments to weight loader
|
||||
|
||||
Returns:
|
||||
number of elements loaded by the weight loader, the return value of the
|
||||
weight loader
|
||||
"""
|
||||
with CopyCounter() as counter:
|
||||
|
||||
@@ -20,9 +20,12 @@ def sanitize_layer_refs(tensor: torch.Tensor, layer: torch.nn.Module) -> torch.T
|
||||
tensors will reference layers, and the WeakKeyDictionary will never evict entries,
|
||||
even when the model is deleted.
|
||||
|
||||
:param tensor: tensor to be sanitized
|
||||
:param layer: layer whose references should be removed
|
||||
:return: sanitized tensor
|
||||
Args:
|
||||
tensor: tensor to be sanitized
|
||||
layer: layer whose references should be removed
|
||||
|
||||
Returns:
|
||||
sanitized tensor
|
||||
"""
|
||||
for key, value in tensor.__dict__.items():
|
||||
if isinstance(value, MethodType) and value.__self__ is layer:
|
||||
@@ -38,10 +41,12 @@ def restore_layer_refs(tensor: torch.Tensor, layer: torch.nn.Module) -> torch.Te
|
||||
Used by `restore_layer_on_meta` to add back layer references, allowing for proper
|
||||
weight loading.
|
||||
|
||||
:param tensor: tensor to be sanitized
|
||||
:param layer: layer whose references should be removed
|
||||
:return: sanitized tensor
|
||||
Args:
|
||||
tensor: tensor to be sanitized
|
||||
layer: layer whose references should be removed
|
||||
|
||||
Returns:
|
||||
sanitized tensor
|
||||
"""
|
||||
for key, value in tensor.__dict__.items():
|
||||
if isinstance(value, MethodType) and value.__self__ is layer_ref_sentinel:
|
||||
|
||||
@@ -49,8 +49,11 @@ def has_device_tensors(bound_args: BoundArguments) -> bool:
|
||||
"""
|
||||
Return True if the loaded weights exist on an accelerator device
|
||||
|
||||
:param bound_args: args to load weights
|
||||
:return: True if weights are on accelerator device
|
||||
Args:
|
||||
bound_args: args to load weights
|
||||
|
||||
Returns:
|
||||
True if weights are on accelerator device
|
||||
"""
|
||||
return any(
|
||||
isinstance(value, torch.Tensor) and value.device.type not in ("meta", "cpu")
|
||||
@@ -62,8 +65,11 @@ def get_info_size(info: LayerReloadingInfo) -> int:
|
||||
"""
|
||||
Calculate the number of bytes used by loaded weights for a given layer
|
||||
|
||||
:param info: layerwise info to get size of
|
||||
:return: number of bytes used by loaded weights
|
||||
Args:
|
||||
info: layerwise info to get size of
|
||||
|
||||
Returns:
|
||||
number of bytes used by loaded weights
|
||||
"""
|
||||
return sum(
|
||||
value.nbytes
|
||||
|
||||
@@ -277,7 +277,8 @@ class OlmoModel(nn.Module):
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | IntermediateTensors:
|
||||
"""
|
||||
:param input_ids: A tensor of shape `(batch_size, seq_len)`.
|
||||
Args:
|
||||
input_ids: A tensor of shape `(batch_size, seq_len)`.
|
||||
"""
|
||||
if get_pp_group().is_first_rank:
|
||||
if inputs_embeds is not None:
|
||||
|
||||
@@ -314,7 +314,8 @@ class Olmo2Model(nn.Module):
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | IntermediateTensors:
|
||||
"""
|
||||
:param input_ids: A tensor of shape `(batch_size, seq_len)`.
|
||||
Args:
|
||||
input_ids: A tensor of shape `(batch_size, seq_len)`.
|
||||
"""
|
||||
if get_pp_group().is_first_rank:
|
||||
if inputs_embeds is not None:
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
from collections.abc import Callable, Hashable
|
||||
from fractions import Fraction
|
||||
from typing import Any
|
||||
from weakref import WeakValueDictionary
|
||||
|
||||
import torch
|
||||
@@ -42,10 +43,9 @@ class BasevLLMParameter(Parameter):
|
||||
"""
|
||||
Initialize the BasevLLMParameter
|
||||
|
||||
:param data: torch tensor with the parameter data
|
||||
:param weight_loader: weight loader callable
|
||||
|
||||
:returns: a torch.nn.parameter
|
||||
Args:
|
||||
data: torch tensor with the parameter data
|
||||
weight_loader: weight loader callable
|
||||
"""
|
||||
|
||||
# During weight loading, we often do something like:
|
||||
@@ -445,15 +445,16 @@ class SharedWeightParameter(BasevLLMParameter):
|
||||
"currently support tensor parallelism"
|
||||
)
|
||||
|
||||
def add_partition(self, index: int, data_key: Hashable, *args, **kwargs):
|
||||
def add_partition(self, index: int, data_key: Hashable, *args: Any, **kwargs: Any):
|
||||
"""
|
||||
Add a partition to the weight parameter. Partitions whose `data_key`
|
||||
is the same will share tensor data
|
||||
|
||||
:param index: index of partition to add
|
||||
:param data_key: hashable key used to key shared tensors
|
||||
:param *args: arguments for `torch.empty`
|
||||
:param **kwargs: keyword arguments for `torch.empty`
|
||||
Args:
|
||||
index: index of partition to add
|
||||
data_key: hashable key used to key shared tensors
|
||||
*args: arguments for `torch.empty`
|
||||
**kwargs: keyword arguments for `torch.empty`
|
||||
"""
|
||||
# load (shared) tensor using `data_key`
|
||||
if data_key not in self.tensors_registry:
|
||||
|
||||
@@ -84,8 +84,11 @@ def maybe_model_redirect(model: str) -> str:
|
||||
"""
|
||||
Use model_redirect to redirect the model name to a local folder.
|
||||
|
||||
:param model: hf model name
|
||||
:return: maybe redirect to a local folder
|
||||
Args:
|
||||
model: hf model name
|
||||
|
||||
Returns:
|
||||
maybe redirect to a local folder
|
||||
"""
|
||||
|
||||
model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH
|
||||
|
||||
@@ -808,14 +808,17 @@ class AttentionImpl(AttentionImplBase[T], Generic[T]):
|
||||
) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
def fused_output_quant_supported(self, quant_key: "QuantKey"):
|
||||
def fused_output_quant_supported(self, quant_key: "QuantKey") -> bool:
|
||||
"""
|
||||
Does this attention implementation support fused output quantization.
|
||||
This is used by the AttnFusionPass to only fuse output quantization
|
||||
onto implementations that support it.
|
||||
|
||||
:param quant_key: QuantKey object that describes the quantization op
|
||||
:return: is fusion supported for this type of quantization
|
||||
Args:
|
||||
quant_key: QuantKey object that describes the quantization op
|
||||
|
||||
Returns:
|
||||
is fusion supported for this type of quantization
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
@@ -1878,9 +1878,8 @@ class GPUModelRunner(
|
||||
SpecDecodeMetadata | None,
|
||||
]:
|
||||
"""
|
||||
:return: tuple[
|
||||
logits_indices, spec_decode_metadata,
|
||||
]
|
||||
Returns:
|
||||
tuple[logits_indices, spec_decode_metadata]
|
||||
"""
|
||||
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||
assert total_num_scheduled_tokens > 0
|
||||
@@ -2205,7 +2204,8 @@ class GPUModelRunner(
|
||||
slot_mappings: dict[int, torch.Tensor] | None = None,
|
||||
) -> tuple[PerLayerAttnMetadata, CommonAttentionMetadata | None]:
|
||||
"""
|
||||
:return: tuple[attn_metadata, spec_decode_common_attn_metadata]
|
||||
Returns:
|
||||
tuple[attn_metadata, spec_decode_common_attn_metadata]
|
||||
"""
|
||||
# Attention metadata is not needed for attention free models
|
||||
if len(self.kv_cache_config.kv_cache_groups) == 0:
|
||||
@@ -2503,9 +2503,11 @@ class GPUModelRunner(
|
||||
num_common_prefix_blocks: list[int],
|
||||
) -> list[list[int]] | None:
|
||||
"""
|
||||
:return: Optional[cascade_attn_prefix_lens]
|
||||
cascade_attn_prefix_lens is 2D: ``[kv_cache_group_id][attn_group_idx]``,
|
||||
None if we should not use cascade attention
|
||||
Returns:
|
||||
Optional[cascade_attn_prefix_lens]
|
||||
cascade_attn_prefix_lens is 2D:
|
||||
``[kv_cache_group_id][attn_group_idx]``,
|
||||
None if we should not use cascade attention
|
||||
"""
|
||||
|
||||
use_cascade_attn = False
|
||||
@@ -5324,11 +5326,12 @@ class GPUModelRunner(
|
||||
"""
|
||||
Reload weights from a weights iterator or from disk
|
||||
|
||||
:param weights_iterator: weights to load into model
|
||||
:param weights_path: path to load weights from if weights_iterator is not
|
||||
provided. Use path of original model if neither is provided.
|
||||
:param is_checkpoint_format: set to False if weights have already been processed
|
||||
into kernel format (repacking, renaming, etc.)
|
||||
Args:
|
||||
weights_iterator: weights to load into model
|
||||
weights_path: path to load weights from if weights_iterator is not
|
||||
provided. Use path of original model if neither is provided.
|
||||
is_checkpoint_format: set to False if weights have already been
|
||||
processed into kernel format (repacking, renaming, etc.)
|
||||
"""
|
||||
# TODO(@kylesayrs): generalize to all runners and loaders
|
||||
# argument validation
|
||||
|
||||
Reference in New Issue
Block a user