[MRV2] Also enable MRV2 for Llama and Mistral dense models (#43458)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
2026-06-06 00:16:14 +00:00 · 2026-06-02 11:18:46 -07:00
parent ed9a7526b6
commit da107a59e5
5 changed files with 58 additions and 8 deletions
@@ -90,6 +90,26 @@ def test_v2_model_runner_env_tri_state(monkeypatch, env_value, expected):
            ),
            True,
        ),
+        (
+            SimpleNamespace(
+                model="meta-llama/Llama-3.2-1B",
+                architectures=["LlamaForCausalLM"],
+                runner_type="generate",
+                is_moe=False,
+                is_quantized=False,
+            ),
+            True,
+        ),
+        (
+            SimpleNamespace(
+                model="mistralai/Mistral-7B-v0.1",
+                architectures=["MistralForCausalLM"],
+                runner_type="generate",
+                is_moe=False,
+                is_quantized=False,
+            ),
+            True,
+        ),
        (
            SimpleNamespace(
                model="facebook/opt-125m",
@@ -184,14 +184,31 @@ async def test_abort_during_final_step(async_scheduling: bool):
        original_execute_model = Worker.execute_model

        def execute_model_with_wait(self, scheduler_output):
-            # Signal that execute_model has been called by deleting ready_file
-            if ready_file.exists():
-                ready_file.unlink()
+            # V2's `gpu_worker.compile_or_warm_up_model` calls
+            # `warmup_kernels(...)` during engine init, which itself calls
+            # `Worker.execute_model` three times (prefill / decode / cleanup)
+            # to JIT-compile Triton kernels. None of those calls carries the
+            # test's request id, so we only stall when our actual request is
+            # being processed.
+            scheduled = scheduler_output.num_scheduled_tokens or {}
+            finished = scheduler_output.finished_req_ids or set()

-            # Wait for the block file to be deleted (triggered from test after abort)
-            # This runs in the worker process (after fork), so we poll the filesystem
-            while block_file.exists():
-                time.sleep(0.01)
+            def is_target_request(req_ids):
+                return any(
+                    rid == request_id or rid.startswith(f"{request_id}-")
+                    for rid in req_ids
+                )
+
+            if is_target_request(scheduled) or is_target_request(finished):
+                # Signal that execute_model has been called by deleting ready_file
+                if ready_file.exists():
+                    ready_file.unlink()
+
+                # Wait for the block file to be deleted (triggered from test after
+                # abort). This runs in the worker process (after fork), so we poll
+                # the filesystem.
+                while block_file.exists():
+                    time.sleep(0.01)
            return original_execute_model(self, scheduler_output)

        # Patch execute_model to inject the wait
@@ -36,6 +36,7 @@ def evil_forward(self, *args, **kwargs):
        raise Exception("Simulated illegal memory access on Rank 0!")
    self.num_calls += 1

+    kwargs.setdefault("intermediate_tensors", None)  # required for MRV2
    return self.model(*args, **kwargs)


@@ -66,7 +66,13 @@ else:

 logger = init_logger(__name__)

-DEFAULT_V2_MODEL_RUNNER_ARCHITECTURES = frozenset({"Qwen3ForCausalLM"})
+DEFAULT_V2_MODEL_RUNNER_ARCHITECTURES = frozenset(
+    {
+        "LlamaForCausalLM",
+        "MistralForCausalLM",
+        "Qwen3ForCausalLM",
+    }
+)


 class OptimizationLevel(IntEnum):
@@ -344,6 +344,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        self.reset_encoder_cache()
        self.reset_mm_cache()

+    def apply_sparse_weight_patches(self, *args, **kwargs) -> None:
+        # TODO: Replace this V1 delegation with a native version once migrated to v2
+        from vllm.v1.worker.gpu_model_runner import GPUModelRunner as GPUModelRunnerV1
+
+        GPUModelRunnerV1.apply_sparse_weight_patches(self, *args, **kwargs)  # type: ignore[arg-type]
+
    def update_config(self, *args, **kwargs) -> None:
        # TODO(Wentao): Use full version instead of import when fully migrated to v2
        from vllm.v1.worker.gpu_model_runner import GPUModelRunner as GPUModelRunnerV1