mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
213f10bfdd
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
377 lines
14 KiB
Python
377 lines
14 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""Tests for vllm.compilation.codegen — execution code generation.
|
|
|
|
Each test runs a real Python function through the same pipeline vLLM uses
|
|
in production: ``make_fx`` to obtain an aten-level fx graph, ``split_graph``
|
|
to split it into the stitching layer + submodules, and then
|
|
``generate_execution_code``/``compile_execution_fn`` for codegen.
|
|
"""
|
|
|
|
from collections.abc import Callable
|
|
|
|
import pytest
|
|
import regex as re
|
|
import torch
|
|
import torch.fx as fx
|
|
from torch.fx.experimental.proxy_tensor import make_fx
|
|
|
|
from vllm.compilation.backends import split_graph
|
|
from vllm.compilation.codegen import (
|
|
_node_ref,
|
|
compile_execution_fn,
|
|
generate_execution_code,
|
|
generate_execution_code_with_name,
|
|
)
|
|
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
|
|
|
|
|
def _trace_and_split(
    model_fn: Callable[..., torch.Tensor],
    example_inputs: tuple[torch.Tensor, ...],
    split_ops: list[str],
) -> fx.GraphModule:
    """Produce the split GraphModule that the codegen tests consume.

    ``model_fn`` is traced by ``make_fx`` using ``example_inputs``; the traced
    graph is then passed to ``split_graph`` together with ``split_ops`` (the
    names of the aten ops to split on). Only the first element of the
    ``split_graph`` result is returned, the second one is discarded.
    """
    traced = make_fx(model_fn)(*example_inputs)
    stitched, _discarded = split_graph(traced, split_ops)
    return stitched
|
|
|
|
|
|
def _to_copy_model(x: torch.Tensor) -> torch.Tensor:
|
|
"""Traces to ``aten._to_copy.default`` with device + dtype kwargs."""
|
|
return x.to(device=torch.device("cpu"), dtype=torch.float16)
|
|
|
|
|
|
def _empty_model(x: torch.Tensor) -> torch.Tensor:
|
|
"""Traces to ``aten.empty.memory_format`` with device + dtype kwargs."""
|
|
buf = torch.empty(x.shape, device=torch.device("cpu"), dtype=torch.float16)
|
|
return buf.fill_(0).add(x.to(dtype=torch.float16))
|
|
|
|
|
|
@pytest.fixture
def x() -> torch.Tensor:
    """Provide the shared test input: a 2x3 tensor filled with zeros."""
    shape = (2, 3)
    return torch.zeros(shape)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "model_fn,split_ops",
    [
        pytest.param(
            _to_copy_model,
            ["aten::_to_copy.default"],
            id="aten::_to_copy.default",
        ),
        pytest.param(_empty_model, [], id="aten::empty.memory_format"),
    ],
)
def test_non_primitive_kwargs_lifted_to_consts(
    model_fn: Callable[[torch.Tensor], torch.Tensor],
    split_ops: list[str],
    x: torch.Tensor,
) -> None:
    """Regression test for arguments whose ``repr()`` is not evaluable.

    Objects such as ``torch.device`` used to be inlined through ``repr()``,
    which produced source like

        out = torch.ops.aten._to_copy.default(x, device=device(type='cpu'))

    That fails at call time because only ``torch`` and ``operator`` are
    imported into the generated function's namespace, leaving ``device``
    unbound. The fix gathers such objects in ``__vllm_consts__`` and refers to
    them by index, so the unqualified ``device(type=...)`` form must never
    show up in the generated source.
    """
    graph_module = _trace_and_split(model_fn, (x,), split_ops)
    code, submod_names, consts = generate_execution_code(graph_module)

    # Source-level check: nothing was inlined via repr().
    assert "device(type=" not in code, (
        "Generated code contains unqualified `device(type=...)` from repr(); "
        "torch.device should be lifted into __vllm_consts__"
    )

    # Both non-primitive kwargs must have been collected as constants.
    assert torch.device("cpu") in consts, "torch.device kwarg not lifted to consts"
    assert torch.float16 in consts, "torch.dtype kwarg not lifted to consts"

    # End-to-end check: the compiled function reproduces the eager result.
    compiled = compile_execution_fn(code, {}, submod_names, consts)
    actual = compiled(x)
    reference = model_fn(x)
    assert torch.equal(actual, reference), "Compiled output does not match reference"
|
|
|
|
|
|
def test_dtype_singleton_deduped(x: torch.Tensor) -> None:
    """Two ops that use the same dtype must share one consts slot.

    ``torch.float16`` is a process-wide singleton, so two ops referring to it
    in the traced graph share a single consts slot via ``id()``-based dedup.
    Distinct expressions (``x.to(...)`` vs ``(x*2).to(...)``) ensure the
    tracer can't CSE the two ops into a single node.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        return x.to(dtype=torch.float16) + (x * 2).to(dtype=torch.float16)

    split_gm = _trace_and_split(model_fn, (x,), [])
    code, submod_names, consts = generate_execution_code(split_gm)

    # The traced graph must have two distinct _to_copy nodes (otherwise the
    # dedup assertion below is trivially satisfied).
    n_to_copy = sum(
        1
        for node in split_gm.graph.nodes
        if node.op == "call_module"
        for inner in getattr(split_gm, node.target).graph.nodes
        if inner.op == "call_function" and "to_copy" in inner.name
    )
    assert n_to_copy >= 2, (
        f"Test setup failed: expected ≥2 _to_copy nodes, got {n_to_copy}"
    )

    assert consts.count(torch.float16) == 1, (
        f"torch.float16 should occupy exactly one slot, got consts={consts}"
    )

    # Look the slot index up instead of assuming it is 0, so the check keeps
    # pointing at the fp16 slot even if another constant is lifted first.
    fp16_ref = f"__vllm_consts__[{consts.index(torch.float16)}]"
    assert code.count(fp16_ref) >= 2, (
        "Deduped const slot should be referenced from both _to_copy nodes"
    )

    fn = compile_execution_fn(code, {}, submod_names, consts)
    assert torch.equal(fn(x), model_fn(x))
|
|
|
|
|
|
def test_distinct_dtypes_get_distinct_slots(x: torch.Tensor) -> None:
    """Different dtype singletons in one traced graph get separate slots."""

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        as_half = x.to(dtype=torch.float16)
        as_bfloat = x.to(dtype=torch.bfloat16)
        return as_half + as_bfloat

    graph_module = _trace_and_split(model_fn, (x,), [])
    _code, _names, consts = generate_execution_code(graph_module)

    # Both dtypes are present, and nothing else was lifted alongside them.
    assert torch.float16 in consts
    assert torch.bfloat16 in consts
    assert len(consts) == 2, f"Expected 2 distinct dtype slots, got {consts}"
|
|
|
|
|
|
def test_consts_ordering_deterministic(x: torch.Tensor) -> None:
    """Independent traces of one model yield identically ordered consts.

    Cache artifacts identify const slots by index, so an order that varied
    between traces would invalidate cached code.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        # Multiple distinct non-primitives encountered in a fixed graph order.
        half_on_cpu = x.to(device=torch.device("cpu"), dtype=torch.float16)
        return half_on_cpu.to(dtype=torch.bfloat16)

    def consts_from_fresh_trace() -> list:
        # Each call re-traces and re-splits, so the two runs share no state.
        _code, _names, lifted = generate_execution_code(
            _trace_and_split(model_fn, (x,), [])
        )
        return lifted

    consts1 = consts_from_fresh_trace()
    consts2 = consts_from_fresh_trace()

    assert len(consts1) >= 2, "Test setup: model should produce ≥2 const slots"
    assert consts1 == consts2, (
        f"consts ordering must be reproducible across traces; "
        f"got {consts1} vs {consts2}"
    )
|
|
|
|
|
|
def test_primitive_args_inlined(x: torch.Tensor) -> None:
    """Primitive arguments (such as int dims) are inlined and lift no consts."""

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        swapped = torch.transpose(x, 0, 1)
        return swapped.relu()

    graph_module = _trace_and_split(model_fn, (x,), [])
    code, submod_names, consts = generate_execution_code(graph_module)

    assert consts == [], "Primitive-only graph must produce empty consts"

    # The generated function must still reproduce the eager result.
    compiled = compile_execution_fn(code, {}, submod_names, consts)
    assert torch.equal(compiled(x), model_fn(x))
|
|
|
|
|
|
def test_consts_shared_across_split_submods(x: torch.Tensor) -> None:
    """Const dedup has to work across inlined submodules, not only inside one.

    The model below splits into three inlined submods, two of which refer to
    ``torch.float16`` independently. The shared ``const_index`` threaded
    through the recursive ``generate_execution_code_with_name`` calls must
    collapse that dtype into a single slot used from both submods.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        half = x.to(dtype=torch.float16)  # submod_0: _to_copy(fp16)
        activated = half.relu()  # submod_1: relu (split point)
        widened = activated.to(dtype=torch.float32)  # submod_2: _to_copy(fp32)
        # submod_2: another _to_copy(fp16)
        return widened.to(dtype=torch.float16) + 1

    split_gm = _trace_and_split(model_fn, (x,), ["aten::relu.default"])

    n_submods = len(list(split_gm.named_children()))
    assert n_submods >= 3, (
        f"Test setup failed: expected ≥3 submods after split, got {n_submods}"
    )

    code, submod_names, consts = generate_execution_code(split_gm)

    assert consts.count(torch.float16) == 1, (
        f"fp16 singleton must dedup across submods, got consts={consts}"
    )

    # Locate the fp16 slot and require references from at least two different
    # inlined submods. This excludes the false positive where one submod uses
    # the slot twice while the other never touches it.
    fp16_ref = f"__vllm_consts__[{consts.index(torch.float16)}]"
    submod_bodies = re.findall(
        r"def __vllm_inlined_submods__(\d+)\([^)]*\):\n((?: .*\n)+)", code
    )
    assert len(submod_bodies) >= 2

    referencing_submods = []
    for submod_id, body in submod_bodies:
        if fp16_ref in body:
            referencing_submods.append(submod_id)
    assert len(referencing_submods) >= 2, (
        f"fp16 slot should be referenced from ≥2 inlined submods, "
        f"got {referencing_submods}"
    )

    compiled = compile_execution_fn(code, {}, submod_names, consts)
    assert torch.equal(compiled(x), model_fn(x))
|
|
|
|
|
|
def test_non_graphmodule_submod_uses_indexed_callable(x: torch.Tensor) -> None:
    """Non-GraphModule children are called through ``__vllm_submods__``.

    In production a child of split_gm stops being a ``torch.fx.GraphModule``
    once ``PiecewiseBackend`` replaces submods. For such a child the codegen
    emits ``__vllm_submods__[idx](...)`` instead of inlining it, and the
    runtime callable is bound from ``submod_callables``.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        return x.relu().sigmoid()

    class NonGMWrapper(torch.nn.Module):
        """Plain ``nn.Module`` that forwards every call to a GraphModule."""

        def __init__(self, gm: fx.GraphModule) -> None:
            super().__init__()
            self.gm = gm

        def forward(self, *args, **kwargs):
            return self.gm(*args, **kwargs)

    split_gm = _trace_and_split(model_fn, (x,), ["aten::relu.default"])

    # Swap the first child for a non-GraphModule wrapper that delegates to
    # the original. This mirrors the structure vLLM ends up with after
    # PiecewiseBackend takes over a submod.
    target_name = [child_name for child_name, _ in split_gm.named_children()][0]
    wrapped = NonGMWrapper(getattr(split_gm, target_name))
    del split_gm._modules[target_name]
    split_gm.add_module(target_name, wrapped)

    code, submod_names, consts = generate_execution_code(split_gm)

    assert "__vllm_submods__[" in code, (
        "Non-GraphModule submod should produce an indexed callable reference"
    )
    assert target_name in submod_names

    # Only children that are not GraphModules are supplied as runtime
    # callables.
    submod_callables = {}
    for name in submod_names:
        child = getattr(split_gm, name)
        if isinstance(child, fx.GraphModule):
            continue
        submod_callables[name] = child

    compiled = compile_execution_fn(code, submod_callables, submod_names, consts)
    assert torch.equal(compiled(x), model_fn(x))
|
|
|
|
|
|
# split_graph only passes tuple_return=True to split_module on PyTorch >= 2.12,
|
|
# so getitem nodes only appear in the stitching graph from that version onward.
|
|
@pytest.mark.skipif(
    not is_torch_equal_or_newer("2.12.0.dev"),
    reason="split_module tuple_return requires PyTorch >= 2.12",
)
def test_getitem_in_stitching_graph(x: torch.Tensor) -> None:
    """``operator.getitem`` nodes are emitted as plain indexing.

    ``operator.getitem`` on submod tuple returns is the ``call_function``
    special case in codegen.py: it is written as ``name = source[index]``
    instead of a function call.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        return x.relu().sigmoid()

    graph_module = _trace_and_split(model_fn, (x,), ["aten::relu.default"])
    code, _names, _consts = generate_execution_code(graph_module)

    # split_module wraps each submod return in a tuple, so the stitching
    # graph unpacks it through getitem; codegen must render that as indexing.
    indexing_stmt = re.search(r"\b\w+ = \w+\[\d+\]\n", code)
    assert indexing_stmt, (
        "Stitching graph should emit `name = source[N]` for getitem nodes"
    )
|
|
|
|
|
|
def test_del_emitted_for_intermediate_values(x: torch.Tensor) -> None:
    """Generated code must ``del`` intermediates after their last use.

    The codegen schedules ``del`` after a value's last use to free memory
    early. Multi-submod splits naturally have intermediates whose last use is
    not the output node, so splitting on two ops guarantees at least one
    candidate.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        return x.relu().sigmoid().tanh()

    split_gm = _trace_and_split(
        model_fn, (x,), ["aten::relu.default", "aten::sigmoid.default"]
    )
    code, _, _ = generate_execution_code(split_gm)

    # Accept any amount of leading indentation: the check is about a `del`
    # statement inside a function body, not about the emitter's indent width.
    assert re.search(r"^[ \t]+del \w+", code, re.MULTILINE), (
        "Liveness analysis should emit `del` for intermediates with "
        "last-use before the output"
    )
|
|
|
|
|
|
def test_with_submod_false_rejects_call_module() -> None:
    """``with_submod=False`` must reject graphs containing ``call_module``.

    ``generate_execution_code_with_name(with_submod=False)`` is the recursive
    entry point for inlining a GraphModule into its parent. Handling
    ``call_module`` nodes is the parent's job, so a graph that contains one
    has to be refused.
    """
    # Owner module holding the child that the call_module node targets.
    owner = torch.nn.Module()
    owner.add_module("inner", torch.nn.Identity())

    # Hand-built graph: placeholder -> call_module("inner") -> output.
    graph = fx.Graph()
    placeholder = graph.placeholder("x")
    graph.output(graph.call_module("inner", args=(placeholder,)))

    gm = fx.GraphModule(owner, graph)

    with pytest.raises(RuntimeError, match="call_module is not allowed"):
        generate_execution_code_with_name(gm, "f", with_submod=False)
|
|
|
|
|
|
def test_node_ref_recurses_through_containers() -> None:
    """``_node_ref`` lifts non-primitives nested in list/tuple/dict args.

    Real aten ops rarely produce such structures, but the path is needed for
    DTensor placement lists and other future cases, so the recursive walker
    is exercised directly. The calls share ``consts``/``const_index`` and
    therefore have to run in this order.
    """
    cpu = torch.device("cpu")
    consts: list = []
    const_index: dict[int, int] = {}

    # Non-primitive in a list, with a primitive next to it.
    list_ref = _node_ref([cpu, 1], consts, const_index)
    assert list_ref == "[__vllm_consts__[0], 1]"
    assert consts == [cpu]

    # Same object inside a tuple: id-based dedup reuses the existing slot.
    tuple_ref = _node_ref((cpu, 2), consts, const_index)
    assert tuple_ref == "(__vllm_consts__[0], 2)"
    assert consts == [cpu]

    # A single-element tuple is rendered with the trailing comma.
    singleton_ref = _node_ref((cpu,), consts, const_index)
    assert singleton_ref == "(__vllm_consts__[0],)"

    # A dict value is lifted in the same way.
    dict_ref = _node_ref({"k": cpu}, consts, const_index)
    assert dict_ref == "{'k': __vllm_consts__[0]}"
|
|
|
|
|
|
def test_legacy_code_without_consts() -> None:
    """Code generated before consts existed must still load and run.

    ``compile_execution_fn(consts=None)`` has to accept source that never
    mentions ``__vllm_consts__`` so that older serialized cache artifacts
    keep working.
    """

    class AddOne(torch.nn.Module):
        """Submodule stand-in that adds one to its input."""

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x + 1

    # Pre-consts codegen: no __vllm_consts__ reference, only torch/operator.
    legacy_code = "".join(
        [
            "import torch\n",
            "def execution_fn(x, *, __vllm_submods__):\n",
            " return __vllm_submods__[0](x) + 1\n",
        ]
    )

    compiled = compile_execution_fn(
        legacy_code, {"sub": AddOne()}, ["sub"], consts=None
    )
    result = compiled(torch.zeros(3))

    # The submodule adds one and the generated code adds one more.
    assert torch.equal(result, torch.full((3,), 2.0))
|