From a80af243560e25f915010011db76ae97c0cef5b5 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 5 Jun 2026 15:51:44 +0100
Subject: [PATCH] Speed up docs build (#44635)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 AGENTS.md                                     |  2 ++
 docs/features/speculative_decoding/README.md  |  2 +-
 mkdocs.yaml                                   | 18 ++++++------
 vllm/_custom_ops.py                           | 16 +++++++----
 vllm/compilation/passes/inductor_pass.py      | 15 ++++++----
 .../passes/utility/fix_functionalization.py   | 17 ++++++-----
 .../passes/utility/noop_elimination.py        | 10 +++++--
 vllm/device_allocator/cumem.py                | 17 ++++++-----
 vllm/distributed/kv_events.py                 | 15 ++++++----
 .../v1/lmcache_integration/vllm_v1_adapter.py | 11 ++++----
 vllm/ir/op.py                                 | 28 +++++++++++--------
 vllm/ir/util.py                               |  6 ++--
 vllm/model_executor/layers/fused_moe/layer.py | 14 ++++++----
 .../layers/quantization/base_config.py        |  5 ++--
 .../compressed_tensors/compressed_tensors.py  | 18 ++++++++----
 .../schemes/compressed_tensors_scheme.py      | 10 +++----
 .../quantization/compressed_tensors/utils.py  | 25 +++++++++--------
 .../layers/quantization/gguf.py               |  5 ++--
 .../layers/quantization/input_quant_fp8.py    | 21 +++++++-------
 .../layers/quantization/kv_cache.py           |  3 +-
 .../layers/quantization/quark/quark.py        |  9 ++++--
 .../quark/schemes/quark_scheme.py             | 10 +++----
 .../model_loader/reload/layerwise.py          |  8 ++++--
 .../model_loader/reload/meta.py               |  9 ++++--
 .../model_loader/reload/sanitize.py           | 17 +++++++----
 .../model_loader/reload/utils.py              | 14 +++++++---
 vllm/model_executor/models/olmo.py            |  3 +-
 vllm/model_executor/models/olmo2.py           |  3 +-
 vllm/model_executor/parameter.py              | 19 +++++++------
 vllm/transformers_utils/utils.py              |  7 +++--
 vllm/v1/attention/backend.py                  |  9 ++++--
 vllm/v1/worker/gpu_model_runner.py            | 27 ++++++++++--------
 32 files changed, 234 insertions(+), 159 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 6566523f48e..6c9ca6377a0 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -103,6 +103,8 @@ pre-commit run mypy-3.10 --all-files --hook-stage manual
 
 The line length limit for Python code is 88 characters. If you are not sure, use pre-commit to check.
 
+Use [Google-style docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (`Args:`/`Returns:`/`Raises:` sections), not reStructuredText/Sphinx fields (`:param:`, `:return:`, `:rtype:`).
+
 ### Commit messages
 
 Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
diff --git a/docs/features/speculative_decoding/README.md b/docs/features/speculative_decoding/README.md
index 768e9f78d40..58d1df9dced 100644
--- a/docs/features/speculative_decoding/README.md
+++ b/docs/features/speculative_decoding/README.md
@@ -169,7 +169,7 @@ speculative decoding, breaking down the guarantees into three key areas:
     >   distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
     > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
     >   without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
-    >   provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](/tests/v1/spec_decode).
+    >   provides a lossless guarantee. Almost all of the tests in [tests/v1/spec_decode](../../../tests/v1/spec_decode)
     >   verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
 
 3. **vLLM Logprob Stability**
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 1fee824f3b2..970bf963309 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -83,22 +83,22 @@ plugins:
         - "re:vllm\\._.*"  # Internal modules
         - "vllm.third_party"
         - "vllm.vllm_flash_attn"
-        - "re:vllm\\.grpc\\..*_pb2.*"  # Auto-generated protobuf files
+        - "vllm.transformers_utils.configs"
+        - "vllm.transformers_utils.processors"
         - !ENV [API_AUTONAV_EXCLUDE, "re:^$"]  # Match nothing by default
   - mkdocstrings:
       handlers:
         python:
           options:
-            show_symbol_type_heading: true
-            show_symbol_type_toc: true
-            filters:
-              - "!.*_pb2_grpc"  # Exclude auto-generated gRPC stubs
-            summary:
-              modules: true
-            show_signature_annotations: true
-            separate_signature: true
+            filters: []
             show_overloads: true
             signature_crossrefs: true
+            # Recommendations from api-autonav
+            docstring_section_style: list
+            parameter_headings: true
+            show_symbol_type_heading: true
+            show_symbol_type_toc: true
+            summary: true
           inventories:
           - https://docs.python.org/3/objects.inv
           - https://typing-extensions.readthedocs.io/en/latest/objects.inv
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index f12d128f083..bd8a19b6d2d 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -868,9 +868,10 @@ def cutlass_scaled_mm_azp(
     bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
-    :param azp_adj: In the per-tensor case, this should include the azp.
-    Always per-channel.
-    :param azp: Only set in the per-token case. Per-token if set.
+    Args:
+        azp_adj: In the per-tensor case, this should include the azp.
+            Always per-channel.
+        azp: Only set in the per-token case. Per-token if set.
     """
     assert b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
     assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
@@ -3886,9 +3887,12 @@ def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
     Note that sylvester hadamard transforms are also symmetric, which means that
     this function is also applies the (transpose <=> inverse) transform.
 
-    :param x: value to be transformed inplace
-    :param inplace: modify value in place
-    :return: value after transformation
+    Args:
+        x: value to be transformed inplace
+        inplace: modify value in place
+
+    Returns:
+        value after transformation
     """
     return torch.ops._C.hadacore_transform(x, inplace)
 
diff --git a/vllm/compilation/passes/inductor_pass.py b/vllm/compilation/passes/inductor_pass.py
index 8a0d5326dd9..29410f960cd 100644
--- a/vllm/compilation/passes/inductor_pass.py
+++ b/vllm/compilation/passes/inductor_pass.py
@@ -82,11 +82,12 @@ class InductorPass(CustomGraphPass):  # type: ignore[misc]
     def hash_source(*srcs: str | Any) -> str:
         """
         Utility method to hash the sources of functions or objects.
-        :param srcs: strings or objects to add to the hash.
-        Objects and functions have their source inspected.
-        Results are cached by resolved types to avoid repeated
-        inspect.getsource() calls.
-        :return:
+
+        Args:
+            srcs: strings or objects to add to the hash.
+                Objects and functions have their source inspected.
+                Results are cached by resolved types to avoid repeated
+                inspect.getsource() calls.
         """
         # Resolve instances to their class for a hashable cache key.
         cache_key = tuple(
@@ -99,7 +100,9 @@ class InductorPass(CustomGraphPass):  # type: ignore[misc]
     def hash_dict(dict_: dict[Any, Any]) -> str:
         """
         Utility method to hash a dictionary, can alternatively be used for uuid.
-        :return: A sha256 hash of the json rep of the dictionary.
+
+        Returns:
+            A sha256 hash of the json rep of the dictionary.
         """
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()
diff --git a/vllm/compilation/passes/utility/fix_functionalization.py b/vllm/compilation/passes/utility/fix_functionalization.py
index 2887c19ad4a..c0643a916b3 100644
--- a/vllm/compilation/passes/utility/fix_functionalization.py
+++ b/vllm/compilation/passes/utility/fix_functionalization.py
@@ -276,9 +276,11 @@ class FixFunctionalizationPass(VllmInductorPass):
         """
         Replace mutated getitem users of the auto-functionalized node with the
         mutated arguments.
-        :param node: The auto-functionalized node
-        :param mutated_args: The mutated arguments, indexed by getitem index.
-        If the value of an arg is a string, `node.kwargs[arg]` is used.
+
+        Args:
+            node: The auto-functionalized node
+            mutated_args: The mutated arguments, indexed by getitem index.
+                If the value of an arg is a string, `node.kwargs[arg]` is used.
         """
         for idx, user in self.getitem_users(node).items():
             # Some functionalized nodes may return both a result at getitem[0]
@@ -317,10 +319,11 @@ class FixFunctionalizationPass(VllmInductorPass):
         as node.kwargs cannot be used.
         See https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351
 
-        :param graph: Graph to insert the defunctionalized node into
-        :param node: The auto-functionalized node to defunctionalize
-        :param args: If we cannot use kwargs, specify args directly.
-        If an arg is a string, `node.kwargs[arg]` is used.
+        Args:
+            graph: Graph to insert the defunctionalized node into
+            node: The auto-functionalized node to defunctionalize
+            args: If we cannot use kwargs, specify args directly.
+                If an arg is a string, `node.kwargs[arg]` is used.
         """  # noqa: E501
         assert is_func(node, auto_functionalized), (
             f"node must be auto-functionalized, is {node} instead"
diff --git a/vllm/compilation/passes/utility/noop_elimination.py b/vllm/compilation/passes/utility/noop_elimination.py
index 5f7d47ad6f8..80bf8ecc603 100644
--- a/vllm/compilation/passes/utility/noop_elimination.py
+++ b/vllm/compilation/passes/utility/noop_elimination.py
@@ -108,9 +108,13 @@ class NoOpEliminationPass(VllmInductorPass):
     def dims_equivalent(self, dim: int | SymInt, i_dim: int | SymInt) -> bool:
         """
         This function checks if two dimensions are equivalent.
-        :param dim: The dimension arg to reshape/slice
-        :param i_dim: The corresponding dimension in the input tensor
-        :return: Are the dimensions equivalent?
+
+        Args:
+            dim: The dimension arg to reshape/slice
+            i_dim: The corresponding dimension in the input tensor
+
+        Returns:
+            Whether the two dimensions are equivalent.
 
         There are two cases in which the dimensions are equivalent:
         1. The dimensions are equal (both integers)
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 6edd69a949e..a652c5a6c73 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -180,8 +180,9 @@ class CuMemAllocator:
         All data in the memory allocation with the specified tag will be
         offloaded to CPU memory, and others will be discarded.
 
-        :param offload_tags: The tags of the memory allocation that will be
-            offloaded. The rest of the memory allocation will be discarded.
+        Args:
+            offload_tags: The tags of the memory allocation that will be
+                offloaded. The rest of the memory allocation will be discarded.
         """
         if offload_tags is None:
             # by default, allocated tensors are offloaded
@@ -230,9 +231,10 @@ class CuMemAllocator:
         All data that is previously offloaded will be loaded back to GPU
         memory, and the rest of the data will have empty memory.
 
-        :param tags: The tags of the memory allocation that will be loaded
-            back to GPU memory. If None, all memory allocation will be loaded
-            back to GPU memory.
+        Args:
+            tags: The tags of the memory allocation that will be loaded
+                back to GPU memory. If None, all memory allocation will be loaded
+                back to GPU memory.
         """
         for ptr, data in self.pointer_to_data.items():
             if tags is None or data.tag in tags:
@@ -255,8 +257,9 @@ class CuMemAllocator:
         All memory allocation created inside the context will be allocated
         in the memory pool, and has the specified tag.
 
-        :param tag: The tag of the memory allocation. If None, the default tag
-            will be used.
+        Args:
+            tag: The tag of the memory allocation. If None, the default tag
+                will be used.
         """
         if tag is None:
             tag = CuMemAllocator.default_tag
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index ee21185969f..c9bc8909519 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -132,7 +132,8 @@ class KVEventAggregator:
         """
         Add events from a worker batch.
 
-        :param events: List of KVCacheEvent objects.
+        Args:
+            events: List of KVCacheEvent objects.
         """
         if not isinstance(events, list):
             raise TypeError("events must be a list of KVCacheEvent.")
@@ -142,7 +143,8 @@ class KVEventAggregator:
         """
         Return events that appeared in all workers.
 
-        :return: List of events present in all workers.
+        Returns:
+            List of events present in all workers.
         """
         return [
             event
@@ -154,7 +156,8 @@ class KVEventAggregator:
         """
         Return all events for all workers.
 
-        :return: List of events for all workers.
+        Returns:
+            List of events for all workers.
         """
         return list(self._event_counter.elements())
 
@@ -168,7 +171,8 @@ class KVEventAggregator:
         """
         Increment the number of workers contributing events.
 
-        :param count: Number to increment the workers by.
+        Args:
+            count: Number to increment the workers by.
         """
         if count <= 0:
             raise ValueError("count must be positive.")
@@ -184,7 +188,8 @@ class KVEventAggregator:
         """
         Return the number of workers.
 
-        :return: int number of workers.
+        Returns:
+            int number of workers.
         """
         return self._num_workers
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
index 35cd7060691..d16fbee585a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -439,13 +439,12 @@ def _init_lmcache_engine(
     `LMCACHE_CONFIG_FILE` to load the configuration file. If that environment
     variable is not set, this function will return None.
 
-    :param lmcache_config: The LMCache configuration.
-    :type lmcache_config: LMCacheEngineConfig
-    :param vllm_config: The vLLM configuration.
-    :type vllm_config: VllmConfig
+    Args:
+        lmcache_config: The LMCache configuration.
+        vllm_config: The vLLM configuration.
 
-    :return: The initialized LMCache engine
-    :rtype: LMCacheEngine
+    Returns:
+        The initialized LMCache engine
     """
     if curr_engine := LMCacheEngineBuilder.get(ENGINE_NAME):
         return curr_engine
diff --git a/vllm/ir/op.py b/vllm/ir/op.py
index 8e82b5d8c7e..742d3f33ff8 100644
--- a/vllm/ir/op.py
+++ b/vllm/ir/op.py
@@ -113,11 +113,14 @@ def register_op(
     """
     Register a new vLLM IR op.
 
-    :param f: the native implementation of the op
-    :param name: the name of the op, defaults to the function name
-    :param activations: list of activation params, defaults to params starting with 'x'
-    :param allow_inplace: add a maybe_inplace overload that allows inplace impls
-    :return: the IrOp object if f is provided, otherwise a decorator
+    Args:
+        f: the native implementation of the op
+        name: the name of the op, defaults to the function name
+        activations: list of activation params, defaults to params starting with 'x'
+        allow_inplace: add a maybe_inplace overload that allows inplace impls
+
+    Returns:
+        the IrOp object if f is provided, otherwise a decorator
 
     Example usage:
     ```python
@@ -245,14 +248,18 @@ class IrOp:
         supported: bool = True,
         supports_args: Callable[..., bool] | None = None,
         inplace: bool = False,
-    ):
+    ) -> Callable[[Callable[..., Any]], "IrOpImpl"]:
         """
         Register an implementation for this custom op.
-        :param provider: The name of the provider, must be unique.
-        :param supported: Static support check, use this to check platform support.
-        :param supports_args: Dynamic arg support check, used for types and shapes.
-        :param inplace: Does this op reuse activation input memory for outputs
-        :return: A decorator that registers the implementation.
+
+        Args:
+            provider: The name of the provider, must be unique.
+            supported: Static support check, use this to check platform support.
+            supports_args: Dynamic arg support check, used for types and shapes.
+            inplace: Does this op reuse activation input memory for outputs
+
+        Returns:
+            A decorator that registers the implementation.
 
         The decorated function must have the same semantics and signature as
         the native implementation.
diff --git a/vllm/ir/util.py b/vllm/ir/util.py
index ac8a06155da..e9240f487ac 100644
--- a/vllm/ir/util.py
+++ b/vllm/ir/util.py
@@ -12,9 +12,10 @@ from typing import Any
 def hash_source(*srcs: str | Any) -> str:
     """
     Utility method to hash the sources of functions or objects.
-    :param srcs: strings or objects to add to the hash.
-    Objects and functions have their source inspected.
-    :return:
+
+    Args:
+        srcs: strings or objects to add to the hash.
+            Objects and functions have their source inspected.
     """
     hasher = hashlib.sha256()
     for src in srcs:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 4ff43ce21b8..2f4401563b6 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -599,12 +599,14 @@ class FusedMoE(PluggableLayer):
     ):
         """
         Load grouped weight scales for group quantization or model weights
-            :param shard_dim: dimension to shard
-            :param expert_data: parameter for a particular expert
-            :param shard_id: either w1, w2, or w3
-            :param loaded_weight: checkpoint weight to load into the param
-            :param tp_rank: tensor parallel rank
-            :param load_full_w2: whether or not the w2 loaded should be sharded.
+
+        Args:
+            shard_dim: dimension to shard
+            expert_data: parameter for a particular expert
+            shard_id: either w1, w2, or w3
+            loaded_weight: checkpoint weight to load into the param
+            tp_rank: tensor parallel rank
+            load_full_w2: whether or not the w2 loaded should be sharded.
         """
         if shard_id == "w2":
             # In the case where we have actorder/g_idx, we do not partition the
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 3c03ff2233b..5b911114d38 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -178,8 +178,9 @@ class QuantizationConfig(ABC):
         Interface for models to update module names referenced in
         quantization configs in order to reflect the vllm model structure
 
-        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
-            structure of the qconfig) to vllm model structure
+        Args:
+            hf_to_vllm_mapper: maps from hf model structure (the assumed
+                structure of the qconfig) to vllm model structure
         """
         # TODO (@kylesayrs): add implementations for all subclasses
         pass
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index b59e12e8e1b..f4bd57e10e6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -267,8 +267,11 @@ class CompressedTensorsConfig(QuantizationConfig):
         cls, config: dict[str, Any]
     ) -> tuple[dict[str, SparsityCompressionConfig], list[str]]:
         """
-        :param config: The `quantization_config` dictionary from config.json
-        :return: A tuple with two elements
+        Args:
+            config: The `quantization_config` dictionary from config.json
+
+        Returns:
+            A tuple with two elements
             1. A dictionary mapping target layer names to their corresponding
                 sparsity_config
             2. A list of layer names to ignore for sparsity
@@ -296,8 +299,11 @@ class CompressedTensorsConfig(QuantizationConfig):
         cls, config: dict[str, Any]
     ) -> QUANTIZATION_SCHEME_MAP_TYPE:
         """
-        :param config: The `quantization_config` dictionary from config.json
-        :return: A dictionary mapping target layer names to their corresponding
+        Args:
+            config: The `quantization_config` dictionary from config.json
+
+        Returns:
+            A dictionary mapping target layer names to their corresponding
             quantization_args for weights and input activations
         """
         target_scheme_map: dict[str, Any] = dict()
@@ -967,7 +973,9 @@ class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
         """
         Validator for the kv cache scheme. Useful for controlling the
         kv cache quantization schemes, that are being supported in vLLM
-        :param kv_cache_scheme: the compressed-tensors kv cache scheme
+
+        Args:
+            kv_cache_scheme: the compressed-tensors kv cache scheme
         """
         if kv_cache_scheme is None:
             return
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index 731cba1ba2a..78419a0dd98 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -38,11 +38,11 @@ class CompressedTensorsScheme(ABC):
         Run the forward pass for the particular scheme. This is where
         scheme-specific dequant/quant steps/kernels should be applied.
 
-        :param layer: torch.nn.Module with the registered weights and
-            other parameters relevant to the particular scheme.
-        :param x: input to the layer
-        :param bias: bias parameter
-
+        Args:
+            layer: torch.nn.Module with the registered weights and
+                other parameters relevant to the particular scheme.
+            x: input to the layer
+            bias: bias parameter
         """
         raise NotImplementedError()
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index def4797b139..afb899cd6d7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -133,12 +133,11 @@ def find_matched_target(
         *All* component module names must match in order for a match to be
         successful. A successful match returns the first component target
 
-    :param layer_name: layer name
-    :param module: torch.nn.Module
-    :param targets: list of targets to match the layer against
-    :param fused_mapping: map from fused layer names to its components
-    :param fused_strategy: either "all" or "any". If using "all", fused
-        layers match if "all" of its components match
+    Args:
+        layer_name: layer name
+        module: torch.nn.Module
+        targets: list of targets to match the layer against
+        fused_mapping: map from fused layer names to its components
     """
 
     if layer_name is None:
@@ -161,9 +160,10 @@ def _find_first_match(
     exactly or as a regex after 're:'. If check_contains is set to True,
     additionally checks if the target string is contained within the value.
 
-    :param value: string to compare the list of targets against
-    :param targets: list of targets to match the layer against
-    :param check_contains: whether or not to do a substring match
+    Args:
+        value: string to compare the list of targets against
+        targets: list of targets to match the layer against
+        check_contains: whether or not to do a substring match
     """
 
     for target in targets:
@@ -205,9 +205,10 @@ def _match_fused_layer(
     Implements an "all" matching strategy where a fused layer matches iff
     "all" of its components match
 
-    :param layer_name: layer name
-    :param target_layers: list of targets to match the layer against
-    :param fused_mapping: map from fused layer names to its components
+    Args:
+        layer_name: layer name
+        target_layers: list of targets to match the layer against
+        fused_mapping: map from fused layer names to its components
 
     Examples:
         layer_name = "model.layers.0.self_attn.qkv_proj"
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index dca49d7ed97..7458b70ea81 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -114,8 +114,9 @@ class GGUFConfig(QuantizationConfig):
         Interface for models to update module names referenced in
         quantization configs in order to reflect the vllm model structure
 
-        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
-            structure of the qconfig) to vllm model structure
+        Args:
+            hf_to_vllm_mapper: maps from hf model structure (the assumed
+                structure of the qconfig) to vllm model structure
         """
         if self.unquantized_modules is not None:
             self.unquantized_modules = hf_to_vllm_mapper.apply_list(
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index d7fa6cf2633..e8810919c20 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -46,16 +46,17 @@ class QuantFP8(CustomOp):
         compile_native: bool = True,
     ):
         """
-        :param static: static or dynamic quantization
-        :param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR,
-            PER_CHANNEL, or arbitrary block size)
-        :param num_token_padding: Pad the token dimension of output to this
-            size
-        :param tma_aligned_scales: For group quantization, output scales in
-            TMA-aligned layout
-        :param column_major_scales: For group quantization, output scales in
-            column major format
-        :param compile_native: Manually compile forward_native if compile mode > None
+        Args:
+            static: static or dynamic quantization
+            group_shape: quantization group shape (PER_TOKEN, PER_TENSOR,
+                PER_CHANNEL, or arbitrary block size)
+            num_token_padding: Pad the token dimension of output to this
+                size
+            tma_aligned_scales: For group quantization, output scales in
+                TMA-aligned layout
+            column_major_scales: For group quantization, output scales in
+                column major format
+            compile_native: Manually compile forward_native if compile mode > None
         """
         super().__init__(compile_native=compile_native)
         self.static = static
diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py
index 739de9236cf..100632686b0 100644
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@@ -47,7 +47,8 @@ class BaseKVCacheMethod(QuantizeMethodBase):
         - quantize k/v_cache entries before saving them to the cache
         - dequantize k/v_cache entries before fetching them from the cache
 
-    :param quant_config: the appropriate QuantizationConfig
+    Args:
+        quant_config: the appropriate QuantizationConfig
     """
 
     def __init__(self, quant_config: QuantizationConfig):
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 54dea48973b..424fdf2fba0 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -87,8 +87,9 @@ class QuarkConfig(QuantizationConfig):
         Interface for models to update module names referenced in
         quantization configs in order to reflect the vllm model structure
 
-        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
-            structure of the qconfig) to vllm model structure
+        Args:
+            hf_to_vllm_mapper: maps from hf model structure (the assumed
+                structure of the qconfig) to vllm model structure
         """
         quant_config_with_hf_to_vllm_mapper: dict[str, Any] = {}
 
@@ -724,7 +725,9 @@ class QuarkKVCacheMethod(BaseKVCacheMethod):
         """
         Validator for the kv cache configuration. Useful for controlling the
         kv cache quantization schemes, that are being supported in vLLM
-        :param kv_cache_config: the quark kv cache scheme
+
+        Args:
+            kv_cache_config: the quark kv cache scheme
         """
         if kv_cache_config is None:
             return
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
index 412a07a85fe..6f8db9ea57d 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -38,11 +38,11 @@ class QuarkScheme(ABC):
         Run the forward pass for the particular scheme. This is where
         scheme-specific dequant/quant steps/kernels should be applied.
 
-        :param layer: torch.nn.Module with the registered weights and
-            other parameters relevant to the particular scheme.
-        :param x: input to the layer
-        :param bias: bias parameter
-
+        Args:
+            layer: torch.nn.Module with the registered weights and
+                other parameters relevant to the particular scheme.
+            x: input to the layer
+            bias: bias parameter
         """
         raise NotImplementedError
 
diff --git a/vllm/model_executor/model_loader/reload/layerwise.py b/vllm/model_executor/model_loader/reload/layerwise.py
index 40dd6dc9f39..6cf1c19cba4 100644
--- a/vllm/model_executor/model_loader/reload/layerwise.py
+++ b/vllm/model_executor/model_loader/reload/layerwise.py
@@ -123,7 +123,8 @@ def initialize_online_processing(layer: torch.nn.Module):
     Called by either `initialize_layerwise_reload` or an online quantization scheme,
     prevents double wrapping in the case of online quantization + reloading
 
-    :param layer: layer whose parameter weight loaders will be wrapped
+    Args:
+        layer: layer whose parameter weight loaders will be wrapped
     """
     info = get_layerwise_info(layer)
 
@@ -222,8 +223,9 @@ def finalize_layerwise_processing(model: torch.nn.Module, model_config: ModelCon
     This function should be applied after `initialize_layerwise_reload` is applied
     unwrap the layerwise weight loaders.
 
-    :param model: model to finalize processing for
-    :param model_config: config needed for applying processing to attention layers
+    Args:
+        model: model to finalize processing for
+        model_config: config needed for applying processing to attention layers
     """
     if hasattr(model, "_original_do_torchao_reload"):
         model._do_torchao_reload = model._original_do_torchao_reload
diff --git a/vllm/model_executor/model_loader/reload/meta.py b/vllm/model_executor/model_loader/reload/meta.py
index 397a458cbdd..283a98de284 100644
--- a/vllm/model_executor/model_loader/reload/meta.py
+++ b/vllm/model_executor/model_loader/reload/meta.py
@@ -175,9 +175,12 @@ def get_numel_loaded(
     """
     Determine how many elements would be loaded by a weight loader call.
 
-    :param weight loader: used to load weights
-    :param args: bound arguments to weight loader
-    :return: number of elements loaded by the weight loader, the return value of the
+    Args:
+        weight_loader: used to load weights
+        args: bound arguments to weight loader
+
+    Returns:
+        number of elements loaded by the weight loader, the return value of the
         weight loader
     """
     with CopyCounter() as counter:
diff --git a/vllm/model_executor/model_loader/reload/sanitize.py b/vllm/model_executor/model_loader/reload/sanitize.py
index 2a6dc7182d0..21c47a2257f 100644
--- a/vllm/model_executor/model_loader/reload/sanitize.py
+++ b/vllm/model_executor/model_loader/reload/sanitize.py
@@ -20,9 +20,12 @@ def sanitize_layer_refs(tensor: torch.Tensor, layer: torch.nn.Module) -> torch.T
     tensors will reference layers, and the WeakKeyDictionary will never evict entries,
     even when the model is deleted.
 
-    :param tensor: tensor to be sanitized
-    :param layer: layer whose references should be removed
-    :return: sanitized tensor
+    Args:
+        tensor: tensor to be sanitized
+        layer: layer whose references should be removed
+
+    Returns:
+        sanitized tensor
     """
     for key, value in tensor.__dict__.items():
         if isinstance(value, MethodType) and value.__self__ is layer:
@@ -38,10 +41,12 @@ def restore_layer_refs(tensor: torch.Tensor, layer: torch.nn.Module) -> torch.Te
     Used by `restore_layer_on_meta` to add back layer references, allowing for proper
     weight loading.
 
-    :param tensor: tensor to be sanitized
-    :param layer: layer whose references should be removed
-    :return: sanitized tensor
+    Args:
+        tensor: tensor whose layer references should be restored
+        layer: layer to which the references should be added back
 
+    Returns:
+        tensor with layer references restored
     """
     for key, value in tensor.__dict__.items():
         if isinstance(value, MethodType) and value.__self__ is layer_ref_sentinel:
diff --git a/vllm/model_executor/model_loader/reload/utils.py b/vllm/model_executor/model_loader/reload/utils.py
index 7a3d6873e10..f0078d0f9d8 100644
--- a/vllm/model_executor/model_loader/reload/utils.py
+++ b/vllm/model_executor/model_loader/reload/utils.py
@@ -49,8 +49,11 @@ def has_device_tensors(bound_args: BoundArguments) -> bool:
     """
     Return True if the loaded weights exist on an accelerator device
 
-    :param bound_args: args to load weights
-    :return: True if weights are on accelerator device
+    Args:
+        bound_args: args to load weights
+
+    Returns:
+        True if weights are on accelerator device
     """
     return any(
         isinstance(value, torch.Tensor) and value.device.type not in ("meta", "cpu")
@@ -62,8 +65,11 @@ def get_info_size(info: LayerReloadingInfo) -> int:
     """
     Calculate the number of bytes used by loaded weights for a given layer
 
-    :param info: layerwise info to get size of
-    :return: number of bytes used by loaded weights
+    Args:
+        info: layerwise info to get size of
+
+    Returns:
+        number of bytes used by loaded weights
     """
     return sum(
         value.nbytes
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 4491a6a3ea1..541f60c2c40 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -277,7 +277,8 @@ class OlmoModel(nn.Module):
         inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor | IntermediateTensors:
         """
-        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        Args:
+            input_ids: A tensor of shape `(batch_size, seq_len)`.
         """
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 212140fe15e..ad04b258bde 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -314,7 +314,8 @@ class Olmo2Model(nn.Module):
         inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor | IntermediateTensors:
         """
-        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        Args:
+            input_ids: A tensor of shape `(batch_size, seq_len)`.
         """
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 4106672d501..7f96ceda09d 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -3,6 +3,7 @@
 
 from collections.abc import Callable, Hashable
 from fractions import Fraction
+from typing import Any
 from weakref import WeakValueDictionary
 
 import torch
@@ -42,10 +43,9 @@ class BasevLLMParameter(Parameter):
         """
         Initialize the BasevLLMParameter
 
-        :param data: torch tensor with the parameter data
-        :param weight_loader: weight loader callable
-
-        :returns: a torch.nn.parameter
+        Args:
+            data: torch tensor with the parameter data
+            weight_loader: weight loader callable
         """
 
         # During weight loading, we often do something like:
@@ -445,15 +445,16 @@ class SharedWeightParameter(BasevLLMParameter):
                 "currently support tensor parallelism"
             )
 
-    def add_partition(self, index: int, data_key: Hashable, *args, **kwargs):
+    def add_partition(self, index: int, data_key: Hashable, *args: Any, **kwargs: Any):
         """
         Add a partition to the weight parameter. Partitions whose `data_key`
         is the same will share tensor data
 
-        :param index: index of partition to add
-        :param data_key: hashable key used to key shared tensors
-        :param *args: arguments for `torch.empty`
-        :param **kwargs: keyword arguments for `torch.empty`
+        Args:
+            index: index of partition to add
+            data_key: hashable key used to key shared tensors
+            *args: arguments for `torch.empty`
+            **kwargs: keyword arguments for `torch.empty`
         """
         # load (shared) tensor using `data_key`
         if data_key not in self.tensors_registry:
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 04def3e3769..cd215421a98 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -84,8 +84,11 @@ def maybe_model_redirect(model: str) -> str:
     """
     Use model_redirect to redirect the model name to a local folder.
 
-    :param model: hf model name
-    :return: maybe redirect to a local folder
+    Args:
+        model: hf model name
+
+    Returns:
+        the model name, possibly redirected to a local folder
     """
 
     model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH
diff --git a/vllm/v1/attention/backend.py b/vllm/v1/attention/backend.py
index af58bfd31a5..4b4a4435b31 100644
--- a/vllm/v1/attention/backend.py
+++ b/vllm/v1/attention/backend.py
@@ -808,14 +808,17 @@ class AttentionImpl(AttentionImplBase[T], Generic[T]):
     ) -> torch.Tensor:
         raise NotImplementedError
 
-    def fused_output_quant_supported(self, quant_key: "QuantKey"):
+    def fused_output_quant_supported(self, quant_key: "QuantKey") -> bool:
         """
         Does this attention implementation support fused output quantization.
         This is used by the AttnFusionPass to only fuse output quantization
         onto implementations that support it.
 
-        :param quant_key: QuantKey object that describes the quantization op
-        :return: is fusion supported for this type of quantization
+        Args:
+            quant_key: QuantKey object that describes the quantization op
+
+        Returns:
+            whether fusion is supported for this type of quantization
         """
         return False
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 462375644ba..801a8574ac7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1878,9 +1878,8 @@ class GPUModelRunner(
         SpecDecodeMetadata | None,
     ]:
         """
-        :return: tuple[
-            logits_indices, spec_decode_metadata,
-        ]
+        Returns:
+            tuple[logits_indices, spec_decode_metadata]
         """
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
@@ -2205,7 +2204,8 @@ class GPUModelRunner(
         slot_mappings: dict[int, torch.Tensor] | None = None,
     ) -> tuple[PerLayerAttnMetadata, CommonAttentionMetadata | None]:
         """
-        :return: tuple[attn_metadata, spec_decode_common_attn_metadata]
+        Returns:
+            tuple[attn_metadata, spec_decode_common_attn_metadata]
         """
         # Attention metadata is not needed for attention free models
         if len(self.kv_cache_config.kv_cache_groups) == 0:
@@ -2503,9 +2503,11 @@ class GPUModelRunner(
         num_common_prefix_blocks: list[int],
     ) -> list[list[int]] | None:
         """
-        :return: Optional[cascade_attn_prefix_lens]
-            cascade_attn_prefix_lens is 2D: ``[kv_cache_group_id][attn_group_idx]``,
-            None if we should not use cascade attention
+        Returns:
+            Optional[cascade_attn_prefix_lens]
+                cascade_attn_prefix_lens is 2D:
+                ``[kv_cache_group_id][attn_group_idx]``,
+                None if we should not use cascade attention
         """
 
         use_cascade_attn = False
@@ -5324,11 +5326,12 @@ class GPUModelRunner(
         """
         Reload weights from a weights iterator or from disk
 
-        :param weights_iterator: weights to load into model
-        :param weights_path: path to load weights from if weights_iterator is not
-            provided. Use path of original model if neither is provided.
-        :param is_checkpoint_format: set to False if weights have already been processed
-            into kernel format (repacking, renaming, etc.)
+        Args:
+            weights_iterator: weights to load into model
+            weights_path: path to load weights from if weights_iterator is not
+                provided. Use path of original model if neither is provided.
+            is_checkpoint_format: set to False if weights have already been
+                processed into kernel format (repacking, renaming, etc.)
         """
         # TODO(@kylesayrs): generalize to all runners and loaders
         # argument validation