mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[MRV2] Also enable MRV2 for Llama and Mistral dense models (#43458)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -90,6 +90,26 @@ def test_v2_model_runner_env_tri_state(monkeypatch, env_value, expected):
|
||||
),
|
||||
True,
|
||||
),
|
||||
(
|
||||
SimpleNamespace(
|
||||
model="meta-llama/Llama-3.2-1B",
|
||||
architectures=["LlamaForCausalLM"],
|
||||
runner_type="generate",
|
||||
is_moe=False,
|
||||
is_quantized=False,
|
||||
),
|
||||
True,
|
||||
),
|
||||
(
|
||||
SimpleNamespace(
|
||||
model="mistralai/Mistral-7B-v0.1",
|
||||
architectures=["MistralForCausalLM"],
|
||||
runner_type="generate",
|
||||
is_moe=False,
|
||||
is_quantized=False,
|
||||
),
|
||||
True,
|
||||
),
|
||||
(
|
||||
SimpleNamespace(
|
||||
model="facebook/opt-125m",
|
||||
|
||||
@@ -184,14 +184,31 @@ async def test_abort_during_final_step(async_scheduling: bool):
|
||||
original_execute_model = Worker.execute_model
|
||||
|
||||
def execute_model_with_wait(self, scheduler_output):
|
||||
# Signal that execute_model has been called by deleting ready_file
|
||||
if ready_file.exists():
|
||||
ready_file.unlink()
|
||||
# V2's `gpu_worker.compile_or_warm_up_model` calls
|
||||
# `warmup_kernels(...)` during engine init, which itself calls
|
||||
# `Worker.execute_model` three times (prefill / decode / cleanup)
|
||||
# to JIT compile triton kernels. None of those carry the test's
|
||||
# request id, so we only stall when our actual request is being
|
||||
# processed.
|
||||
scheduled = scheduler_output.num_scheduled_tokens or {}
|
||||
finished = scheduler_output.finished_req_ids or set()
|
||||
|
||||
# Wait for the block file to be deleted (triggered from test after abort)
|
||||
# This runs in the worker process (after fork), so we poll the filesystem
|
||||
while block_file.exists():
|
||||
time.sleep(0.01)
|
||||
def is_target_request(req_ids):
|
||||
return any(
|
||||
rid == request_id or rid.startswith(f"{request_id}-")
|
||||
for rid in req_ids
|
||||
)
|
||||
|
||||
if is_target_request(scheduled) or is_target_request(finished):
|
||||
# Signal that execute_model has been called by deleting ready_file
|
||||
if ready_file.exists():
|
||||
ready_file.unlink()
|
||||
|
||||
# Wait for the block file to be deleted (triggered from test after
|
||||
# abort). This runs in the worker process (after fork), so we poll
|
||||
# the filesystem.
|
||||
while block_file.exists():
|
||||
time.sleep(0.01)
|
||||
return original_execute_model(self, scheduler_output)
|
||||
|
||||
# Patch execute_model to inject the wait
|
||||
|
||||
@@ -36,6 +36,7 @@ def evil_forward(self, *args, **kwargs):
|
||||
raise Exception("Simulated illegal memory access on Rank 0!")
|
||||
self.num_calls += 1
|
||||
|
||||
kwargs.setdefault("intermediate_tensors", None) # required for MRV2
|
||||
return self.model(*args, **kwargs)
|
||||
|
||||
|
||||
|
||||
+7
-1
@@ -66,7 +66,13 @@ else:
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
DEFAULT_V2_MODEL_RUNNER_ARCHITECTURES = frozenset({"Qwen3ForCausalLM"})
|
||||
DEFAULT_V2_MODEL_RUNNER_ARCHITECTURES = frozenset(
|
||||
{
|
||||
"LlamaForCausalLM",
|
||||
"MistralForCausalLM",
|
||||
"Qwen3ForCausalLM",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class OptimizationLevel(IntEnum):
|
||||
|
||||
@@ -344,6 +344,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.reset_encoder_cache()
|
||||
self.reset_mm_cache()
|
||||
|
||||
def apply_sparse_weight_patches(self, *args, **kwargs) -> None:
|
||||
# TODO: Use full version instead of import when fully migrated to v2
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner as GPUModelRunnerV1
|
||||
|
||||
GPUModelRunnerV1.apply_sparse_weight_patches(self, *args, **kwargs) # type: ignore[arg-type]
|
||||
|
||||
def update_config(self, *args, **kwargs) -> None:
|
||||
# TODO(Wentao): Use full version instead of import when fully migrated to v2
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner as GPUModelRunnerV1
|
||||
|
||||
Reference in New Issue
Block a user