[Feat] allow inplace loading lora (#31326)

Signed-off-by: Jackmin801 <ongjackm@gmail.com> Signed-off-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-06-06 00:16:14 +00:00 · 2026-01-19 18:15:20 -08:00
parent 05dc4bfab6
commit 12dab78f49
10 changed files with 262 additions and 7 deletions
@@ -210,6 +210,24 @@ Alternatively, follow these example steps to implement your own plugin:

    For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).

+### In-Place LoRA Reloading
+
+When dynamically loading LoRA adapters, you may need to replace an existing adapter with updated weights while keeping the same name. The `load_inplace` parameter enables this functionality. This commonly occurs in asynchronous reinforcement learning setups, where adapters are continuously updated and swapped in without interrupting ongoing inference.
+
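+When `load_inplace=True`, vLLM will replace the existing adapter with the new one. When `load_inplace` is omitted or `false` (the default), a request that reuses the name of an already-loaded adapter is rejected with an error.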
+
+Example request to load or replace a LoRA adapter with the same name:
+
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+-H "Content-Type: application/json" \
+-d '{
+    "lora_name": "my-adapter",
+    "lora_path": "/path/to/adapter/v2",
+    "load_inplace": true
+}'
+```
+
 ## New format for `--lora-modules`

 In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
@@ -195,6 +195,22 @@ def qwen3_lora_files():
    return snapshot_download(repo_id="charent/self_cognition_Alice")


+@pytest.fixture(scope="session")
+def qwen3_meowing_lora_files():
+    """Download Qwen3 Meow LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
+
+
+@pytest.fixture(scope="session")
+def qwen3_woofing_lora_files():
+    """Download Qwen3 Woof LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
+
+
@pytest.fixture(scope="session")
 def opt125_lora_files() -> str:
    """Download opt-125m LoRA files once per test session."""
@@ -104,6 +104,82 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files
    assert dynamic_lora_model.id == "qwen3-lora-3"


+@pytest.mark.asyncio
+async def test_load_lora_adapter_with_same_name_replaces_inplace(
+    client: openai.AsyncOpenAI, qwen3_meowing_lora_files, qwen3_woofing_lora_files
+):
+    """Test that load_inplace=True replaces a same-named LoRA adapter in place."""
+    adapter_name = "replaceable-adapter"
+    messages = [
+        {"content": "Follow the instructions to make animal noises", "role": "system"},
+        {"content": "Make your favorite animal noise.", "role": "user"},
+    ]
+
+    # Load LoRA that makes model meow
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
+    )
+    assert "success" in response.lower()
+
+    completion = await client.chat.completions.create(
+        model=adapter_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    assert "Meow Meow Meow" in completion.choices[0].message.content
+
+    # Load LoRA that makes model woof
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={
+            "lora_name": adapter_name,
+            "lora_path": qwen3_woofing_lora_files,
+            "load_inplace": True,
+        },
+    )
+    assert "success" in response.lower()
+
+    completion = await client.chat.completions.create(
+        model=adapter_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    assert "Woof Woof Woof" in completion.choices[0].message.content
+
+
+@pytest.mark.asyncio
+async def test_load_lora_adapter_with_load_inplace_false_errors(
+    client: openai.AsyncOpenAI, qwen3_meowing_lora_files
+):
+    """Test that load_inplace=False returns an error when adapter already exists."""
+    adapter_name = "test-load-inplace-false"
+
+    # Load LoRA adapter first time (should succeed)
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
+    )
+    assert "success" in response.lower()
+
+    # Try to load the same adapter again with load_inplace=False (should fail)
+    with pytest.raises(openai.BadRequestError) as exc_info:
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={
+                "lora_name": adapter_name,
+                "lora_path": qwen3_meowing_lora_files,
+            },
+        )
+
+    # Verify the error message
+    assert "already been loaded" in str(exc_info.value)
+
+
@pytest.mark.asyncio
 async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
    with pytest.raises(openai.NotFoundError):
@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
    return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")


+@pytest.fixture(scope="session")
+def qwen3_meowing_lora_files():
+    """Download Qwen3 Meow LoRA files once per test session."""
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
+
+
+@pytest.fixture(scope="session")
+def qwen3_woofing_lora_files():
+    """Download Qwen3 Woof LoRA files once per test session."""
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
+
+
@pytest.fixture(scope="session")
 def tinyllama_lora_files():
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
 ]


-def format_chatml_messages(prompt: str):
+def format_chatml_messages(
+    prompt: str, system_prompt: str = "You are a helpful assistant."
+) -> list[dict[str, str]]:
    return [
-        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
    single_lora_request = lora_request[0]
    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
    assert len(PROMPTS) == len(outputs)
+
+
+def test_load_inplace_offline_reload(
+    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
+) -> None:
+    """
+    Test that load_inplace=True allows reloading LoRA adapters with the same ID
+    in offline mode (using LLM class directly).
+    """
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    adapter_id = 1
+    messages = format_chatml_messages(
+        "Make your favorite animal noise.",
+        system_prompt="Follow the instructions to make animal noises",
+    )
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+
+    # Load meowing LoRA (load_inplace left at its default, False)
+    meowing_request = LoRARequest(
+        lora_name="test-adapter",
+        lora_int_id=adapter_id,
+        lora_path=qwen3_meowing_lora_files,
+    )
+
+    outputs = llm.chat([messages], sampling_params, lora_request=meowing_request)
+    first_output = outputs[0].outputs[0].text.strip()
+    assert "Meow Meow Meow" in first_output, (
+        f"Expected meowing output, got: {first_output}"
+    )
+
+    # Reload with woofing LoRA (same ID, different weights, load_inplace=True)
+    woofing_request = LoRARequest(
+        lora_name="test-adapter-woof",
+        lora_int_id=adapter_id,  # Same ID
+        lora_path=qwen3_woofing_lora_files,  # Different weights
+        load_inplace=True,  # Force reload
+    )
+
+    outputs = llm.chat([messages], sampling_params, lora_request=woofing_request)
+    second_output = outputs[0].outputs[0].text.strip()
+    assert "Woof Woof Woof" in second_output, (
+        f"Expected woofing output, got: {second_output}"
+    )
+
+
+def test_load_inplace_false_no_reload(
+    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
+) -> None:
+    """
+    Test that load_inplace=False prevents reloading when an adapter
+    with the same ID already exists.
+    """
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    adapter_id = 2
+    messages = format_chatml_messages(
+        "Make your favorite animal noise.",
+        system_prompt="Follow the instructions to make animal noises",
+    )
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+
+    # Load meowing LoRA first (load_inplace left at its default, False)
+    meowing_request_initial = LoRARequest(
+        lora_name="test-adapter-2",
+        lora_int_id=adapter_id,
+        lora_path=qwen3_meowing_lora_files,
+    )
+
+    outputs = llm.chat(
+        [messages], sampling_params, lora_request=meowing_request_initial
+    )
+    first_output = outputs[0].outputs[0].text.strip()
+    assert "Meow Meow Meow" in first_output, (
+        f"Expected meowing output, got: {first_output}"
+    )
+
+    # Try to load woofing LoRA with same ID but load_inplace=False
+    # This should NOT reload (adapter 2 already exists)
+    woofing_request_no_reload = LoRARequest(
+        lora_name="test-adapter-2-woof",
+        lora_int_id=adapter_id,  # Same ID
+        lora_path=qwen3_woofing_lora_files,
+    )
+
+    outputs = llm.chat(
+        [messages], sampling_params, lora_request=woofing_request_no_reload
+    )
+    second_output = outputs[0].outputs[0].text.strip()
+    # Should still get meowing output because it didn't reload
+    assert "Meow Meow Meow" in second_output, (
+        f"Expected meowing output (no reload), got: {second_output}"
+    )
@@ -132,9 +132,16 @@ class OpenAIServingModels:
                return error_check_ret

            lora_path = request.lora_path
-            unique_id = self.lora_id_counter.inc(1)
+            lora_int_id = (
+                self.lora_requests[lora_name].lora_int_id
+                if lora_name in self.lora_requests
+                else self.lora_id_counter.inc(1)
+            )
            lora_request = LoRARequest(
-                lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
+                lora_name=lora_name,
+                lora_int_id=lora_int_id,
+                lora_path=lora_path,
+                load_inplace=request.load_inplace,
            )
            if base_model_name is not None and self.is_base_model(base_model_name):
                lora_request.base_model_name = base_model_name
@@ -187,11 +194,13 @@ class OpenAIServingModels:
                status_code=HTTPStatus.BAD_REQUEST,
            )

+        # Unless the request asks to load in place (load_inplace=True):
        # Check if the lora adapter with the given name already exists
-        if request.lora_name in self.lora_requests:
+        if not request.load_inplace and request.lora_name in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' has already been "
-                "loaded.",
+                "loaded. If you want to load the adapter in place, set 'load_inplace'"
+                " to True.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )
@@ -36,6 +36,7 @@ def attach_router(app: FastAPI):
        request_shape={
            "lora_name": "body.name",
            "lora_path": "body.src",
+            "load_inplace": "body.load_inplace || `false`",
        },
    )
    @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
@@ -7,6 +7,7 @@ from pydantic import BaseModel, Field
 class LoadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_path: str
+    load_inplace: bool = False


 class UnloadLoRAAdapterRequest(BaseModel):
@@ -15,6 +15,11 @@ class LoRARequest(

    lora_int_id must be globally unique for a given adapter.
    This is currently not enforced in vLLM.
+
+    load_inplace: If True, forces reloading the adapter even if one
+        with the same lora_int_id already exists in the cache. This replaces
+        the existing adapter in-place. If False (default), only loads if the
+        adapter is not already loaded.
    """

    lora_name: str
@@ -22,6 +27,7 @@ class LoRARequest(
    lora_path: str = ""
    base_model_name: str | None = msgspec.field(default=None)
    tensorizer_config_dict: dict | None = None
+    load_inplace: bool = False

    def __post_init__(self):
        if self.lora_int_id < 1:
@@ -254,13 +254,20 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
        # This is ok because it's currently only called from
        # the single-threaded core engine loop.

-        if lora_request.lora_int_id not in self.list_adapters():
+        if (
+            lora_request.lora_int_id not in self.list_adapters()
+            or lora_request.load_inplace
+        ):
            # Load the new adapter first to ensure it is actually valid, before
            # evicting any existing adapters.
            # This may cause the # of loaded lora adapters to very temporarily
            # exceed `--max-cpu-loras`.
            lora = self._load_adapter(lora_request)

+            # Drop any adapter already registered under this id so the newly
+            # loaded one takes its place (this is the load_inplace case).
+            self._adapter_manager.remove_adapter(lora.id)
+
            # Loading succeeded, now check if we will exceed cache capacity and
            # evict the oldest adapter if so
            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: