[MRV2] Also enable MRV2 for Llama and Mistral dense models (#43458)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Nick Hill
2026-06-02 11:18:46 -07:00
committed by GitHub
parent ed9a7526b6
commit da107a59e5
5 changed files with 58 additions and 8 deletions
+20
View File
@@ -90,6 +90,26 @@ def test_v2_model_runner_env_tri_state(monkeypatch, env_value, expected):
),
True,
),
(
SimpleNamespace(
model="meta-llama/Llama-3.2-1B",
architectures=["LlamaForCausalLM"],
runner_type="generate",
is_moe=False,
is_quantized=False,
),
True,
),
(
SimpleNamespace(
model="mistralai/Mistral-7B-v0.1",
architectures=["MistralForCausalLM"],
runner_type="generate",
is_moe=False,
is_quantized=False,
),
True,
),
(
SimpleNamespace(
model="facebook/opt-125m",
+24 -7
View File
@@ -184,14 +184,31 @@ async def test_abort_during_final_step(async_scheduling: bool):
original_execute_model = Worker.execute_model
def execute_model_with_wait(self, scheduler_output):
    """Stall the worker step that carries the test's request, then delegate.

    Stand-in for ``Worker.execute_model``: when ``scheduler_output`` refers
    to the request under test, delete ``ready_file`` to signal that
    execution has started and block until ``block_file`` is removed. Every
    other call is forwarded to the original implementation without delay.
    """
    # V2's `gpu_worker.compile_or_warm_up_model` calls
    # `warmup_kernels(...)` during engine init, which itself calls
    # `Worker.execute_model` three times (prefill / decode / cleanup)
    # to JIT compile triton kernels. None of those carry the test's
    # request id, so we only stall when our actual request is being
    # processed.
    scheduled = scheduler_output.num_scheduled_tokens or {}
    finished = scheduler_output.finished_req_ids or set()

    def is_target_request(req_ids):
        # Accept the exact request id as well as ids of the form
        # "<request_id>-<suffix>".
        return any(
            rid == request_id or rid.startswith(f"{request_id}-")
            for rid in req_ids
        )

    if is_target_request(scheduled) or is_target_request(finished):
        # Signal that execute_model has been called by deleting ready_file
        if ready_file.exists():
            ready_file.unlink()

        # Wait for the block file to be deleted (triggered from test after
        # abort). This runs in the worker process (after fork), so we poll
        # the filesystem.
        while block_file.exists():
            time.sleep(0.01)

    return original_execute_model(self, scheduler_output)
# Patch execute_model to inject the wait
+1
View File
@@ -36,6 +36,7 @@ def evil_forward(self, *args, **kwargs):
raise Exception("Simulated illegal memory access on Rank 0!")
self.num_calls += 1
kwargs.setdefault("intermediate_tensors", None) # required for MRV2
return self.model(*args, **kwargs)
+7 -1
View File
@@ -66,7 +66,13 @@ else:
logger = init_logger(__name__)
# Model architectures for which the V2 model runner is the default choice.
DEFAULT_V2_MODEL_RUNNER_ARCHITECTURES = frozenset(
    {
        "LlamaForCausalLM",
        "MistralForCausalLM",
        "Qwen3ForCausalLM",
    }
)
class OptimizationLevel(IntEnum):
+6
View File
@@ -344,6 +344,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.reset_encoder_cache()
self.reset_mm_cache()
def apply_sparse_weight_patches(self, *args, **kwargs) -> None:
    """Forward to the V1 ``GPUModelRunner.apply_sparse_weight_patches``.

    All positional and keyword arguments are passed through unchanged, with
    this runner instance supplied as ``self``.
    """
    # TODO: Use full version instead of import when fully migrated to v2
    from vllm.v1.worker.gpu_model_runner import GPUModelRunner as V1Runner

    v1_apply_patches = V1Runner.apply_sparse_weight_patches
    v1_apply_patches(self, *args, **kwargs)  # type: ignore[arg-type]
def update_config(self, *args, **kwargs) -> None:
# TODO(Wentao): Use full version instead of import when fully migrated to v2
from vllm.v1.worker.gpu_model_runner import GPUModelRunner as GPUModelRunnerV1