[Feat] allow inplace loading lora (#31326)

Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jackmin801
2026-01-19 18:15:20 -08:00
committed by GitHub
parent 05dc4bfab6
commit 12dab78f49
10 changed files with 262 additions and 7 deletions
+18
View File
@@ -210,6 +210,24 @@ Alternatively, follow these example steps to implement your own plugin:
For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
### In-Place LoRA Reloading
When dynamically loading LoRA adapters, you may need to replace an existing adapter with updated weights while keeping the same name. The `load_inplace` parameter enables this functionality. This commonly occurs in asynchronous reinforcement learning setups, where adapters are continuously updated and swapped in without interrupting ongoing inference.
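When `load_inplace=True`, vLLM replaces the already-loaded adapter of the same name with the new one. When `load_inplace` is omitted or `false` (the default), a request to load an adapter whose name is already in use is rejected with an error.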
Example request to load or replace a LoRA adapter with the same name:
```bash
curl -X POST http://localhost:8000/v1/load_lora_adapter \
-H "Content-Type: application/json" \
-d '{
"lora_name": "my-adapter",
"lora_path": "/path/to/adapter/v2",
"load_inplace": true
}'
```
## New format for `--lora-modules`
In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
+16
View File
@@ -195,6 +195,22 @@ def qwen3_lora_files():
return snapshot_download(repo_id="charent/self_cognition_Alice")
@pytest.fixture(scope="session")
def qwen3_meowing_lora_files():
    """Fetch the Qwen3-0.6B "Meow" LoRA snapshot from the Hugging Face Hub.

    The fixture is session-scoped, so the download is requested once per
    test session. The returned value is what the tests pass as ``lora_path``.
    """
    # Function-local import: the hub client is only imported when a test
    # actually requests this fixture.
    from huggingface_hub import snapshot_download

    meow_repo = "Jackmin108/Qwen3-0.6B-Meow-LoRA"
    return snapshot_download(repo_id=meow_repo)
@pytest.fixture(scope="session")
def qwen3_woofing_lora_files():
    """Fetch the Qwen3-0.6B "Woof" LoRA snapshot from the Hugging Face Hub.

    The fixture is session-scoped, so the download is requested once per
    test session. The returned value is what the tests pass as ``lora_path``.
    """
    # Function-local import: the hub client is only imported when a test
    # actually requests this fixture.
    from huggingface_hub import snapshot_download

    woof_repo = "Jackmin108/Qwen3-0.6B-Woof-LoRA"
    return snapshot_download(repo_id=woof_repo)
@pytest.fixture(scope="session")
def opt125_lora_files() -> str:
"""Download opt-125m LoRA files once per test session."""
@@ -104,6 +104,82 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files
assert dynamic_lora_model.id == "qwen3-lora-3"
@pytest.mark.asyncio
async def test_load_lora_adapter_with_same_name_replaces_inplace(
    client: openai.AsyncOpenAI, qwen3_meowing_lora_files, qwen3_woofing_lora_files
):
    """Loading a second adapter under an existing name swaps it in place.

    The "meow" adapter is loaded first and checked through a chat completion;
    the "woof" adapter is then loaded under the same name with
    ``load_inplace`` set, and the completion must reflect the new weights.
    """
    adapter_name = "replaceable-adapter"
    messages = [
        {"content": "Follow the instructions to make animal noises", "role": "system"},
        {"content": "Make your favorite animal noise.", "role": "user"},
    ]

    # Each stage is (load request body, text expected in the completion).
    stages = (
        # Initial load: no `load_inplace`, the name is not registered yet.
        (
            {"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
            "Meow Meow Meow",
        ),
        # Replacement: same name, different weights, explicit in-place load.
        (
            {
                "lora_name": adapter_name,
                "lora_path": qwen3_woofing_lora_files,
                "load_inplace": True,
            },
            "Woof Woof Woof",
        ),
    )

    for load_body, expected_noise in stages:
        load_result = await client.post(
            "load_lora_adapter",
            cast_to=str,
            body=load_body,
        )
        assert "success" in load_result.lower()

        chat = await client.chat.completions.create(
            model=adapter_name,
            messages=messages,
            max_tokens=10,
        )
        assert expected_noise in chat.choices[0].message.content
@pytest.mark.asyncio
async def test_load_lora_adapter_with_load_inplace_false_errors(
    client: openai.AsyncOpenAI, qwen3_meowing_lora_files
):
    """Re-loading an existing adapter name without ``load_inplace`` is rejected.

    The request body never sets ``load_inplace``, so the server-side default
    applies for both calls.
    """
    adapter_name = "test-load-inplace-false"
    load_body = {"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files}

    # First load: the name is new, so this must succeed.
    first_result = await client.post(
        "load_lora_adapter",
        cast_to=str,
        body=dict(load_body),
    )
    assert "success" in first_result.lower()

    # Second load with an identical body: the name is taken, so the server
    # must answer with a 400.
    with pytest.raises(openai.BadRequestError) as raised:
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body=dict(load_body),
        )

    # The error text should explain that the adapter is already present.
    assert "already been loaded" in str(raised.value)
@pytest.mark.asyncio
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
with pytest.raises(openai.NotFoundError):
+12
View File
@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
@pytest.fixture(scope="session")
def qwen3_meowing_lora_files():
    """Fetch the Qwen3 "Meow" LoRA snapshot once per test session."""
    meow_repo = "Jackmin108/Qwen3-0.6B-Meow-LoRA"
    return snapshot_download(repo_id=meow_repo)
@pytest.fixture(scope="session")
def qwen3_woofing_lora_files():
    """Fetch the Qwen3 "Woof" LoRA snapshot once per test session."""
    woof_repo = "Jackmin108/Qwen3-0.6B-Woof-LoRA"
    return snapshot_download(repo_id=woof_repo)
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Fetch the TinyLlama colorist LoRA snapshot once per test session."""
    colorist_repo = "jashing/tinyllama-colorist-lora"
    return snapshot_download(repo_id=colorist_repo)
+111 -2
View File
@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
]
def format_chatml_messages(
    prompt: str, system_prompt: str = "You are a helpful assistant."
) -> list[dict[str, str]]:
    """Build a two-message chat conversation for a single user prompt.

    Args:
        prompt: Text of the user turn.
        system_prompt: Text of the system turn. Defaults to the generic
            assistant instruction, so existing single-argument callers get
            the same conversation as before.

    Returns:
        A list with exactly one system message followed by one user message,
        each a ``{"role": ..., "content": ...}`` dict.
    """
    # Exactly one system entry: the configurable one replaces the previously
    # hard-coded system message rather than being added next to it.
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
def test_load_inplace_offline_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Check that a request with load_inplace=True swaps the weights behind an
    already-used adapter ID when driving the LLM class directly (offline mode).
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 1
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    messages = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )

    def chat_with(request: LoRARequest) -> str:
        # Run one chat turn under the given adapter and return its text.
        results = llm.chat([messages], sampling_params, lora_request=request)
        return results[0].outputs[0].text.strip()

    # First use of this ID: a plain request (load_inplace keeps its default).
    meow_text = chat_with(
        LoRARequest(
            lora_name="test-adapter",
            lora_int_id=adapter_id,
            lora_path=qwen3_meowing_lora_files,
        )
    )
    assert "Meow Meow Meow" in meow_text, (
        f"Expected meowing output, got: {meow_text}"
    )

    # Same ID, different weights: load_inplace=True requests the reload.
    woof_text = chat_with(
        LoRARequest(
            lora_name="test-adapter-woof",
            lora_int_id=adapter_id,
            lora_path=qwen3_woofing_lora_files,
            load_inplace=True,
        )
    )
    assert "Woof Woof Woof" in woof_text, (
        f"Expected woofing output, got: {woof_text}"
    )
def test_load_inplace_false_no_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Check that, without load_inplace, a second request reusing an adapter ID
    does not replace the adapter that is already loaded under that ID.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 2
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    messages = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )

    def chat_with(request: LoRARequest) -> str:
        # Run one chat turn under the given adapter and return its text.
        results = llm.chat([messages], sampling_params, lora_request=request)
        return results[0].outputs[0].text.strip()

    # First use of this ID: a plain request (load_inplace keeps its default).
    meow_text = chat_with(
        LoRARequest(
            lora_name="test-adapter-2",
            lora_int_id=adapter_id,
            lora_path=qwen3_meowing_lora_files,
        )
    )
    assert "Meow Meow Meow" in meow_text, (
        f"Expected meowing output, got: {meow_text}"
    )

    # Same ID pointing at the woofing weights, again without load_inplace:
    # adapter 2 already exists, so it is expected to be left untouched.
    repeat_text = chat_with(
        LoRARequest(
            lora_name="test-adapter-2-woof",
            lora_int_id=adapter_id,
            lora_path=qwen3_woofing_lora_files,
        )
    )
    # Still meowing, because nothing was reloaded.
    assert "Meow Meow Meow" in repeat_text, (
        f"Expected meowing output (no reload), got: {repeat_text}"
    )
+13 -4
View File
@@ -132,9 +132,16 @@ class OpenAIServingModels:
return error_check_ret
lora_path = request.lora_path
unique_id = self.lora_id_counter.inc(1)
lora_int_id = (
self.lora_requests[lora_name].lora_int_id
if lora_name in self.lora_requests
else self.lora_id_counter.inc(1)
)
lora_request = LoRARequest(
lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
lora_name=lora_name,
lora_int_id=lora_int_id,
lora_path=lora_path,
load_inplace=request.load_inplace,
)
if base_model_name is not None and self.is_base_model(base_model_name):
lora_request.base_model_name = base_model_name
@@ -187,11 +194,13 @@ class OpenAIServingModels:
status_code=HTTPStatus.BAD_REQUEST,
)
# If not loading inplace
# Check if the lora adapter with the given name already exists
if request.lora_name in self.lora_requests:
if not request.load_inplace and request.lora_name in self.lora_requests:
return create_error_response(
message=f"The lora adapter '{request.lora_name}' has already been "
"loaded.",
"loaded. If you want to load the adapter in place, set 'load_inplace'"
" to True.",
err_type="InvalidUserInput",
status_code=HTTPStatus.BAD_REQUEST,
)
@@ -36,6 +36,7 @@ def attach_router(app: FastAPI):
request_shape={
"lora_name": "body.name",
"lora_path": "body.src",
"load_inplace": "body.load_inplace || `false`",
},
)
@router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
+1
View File
@@ -7,6 +7,7 @@ from pydantic import BaseModel, Field
class LoadLoRAAdapterRequest(BaseModel):
    """Request body for loading a LoRA adapter under a given name."""

    # Name under which the adapter is registered.
    lora_name: str
    # Location of the adapter to load.
    lora_path: str
    # Opt-in flag for replacing an adapter that is already registered under
    # `lora_name`; off by default.
    load_inplace: bool = False
class UnloadLoRAAdapterRequest(BaseModel):
+6
View File
@@ -15,6 +15,11 @@ class LoRARequest(
lora_int_id must be globally unique for a given adapter.
This is currently not enforced in vLLM.
load_inplace: If True, forces reloading the adapter even if one
with the same lora_int_id already exists in the cache. This replaces
the existing adapter in-place. If False (default), only loads if the
adapter is not already loaded.
"""
lora_name: str
@@ -22,6 +27,7 @@ class LoRARequest(
lora_path: str = ""
base_model_name: str | None = msgspec.field(default=None)
tensorizer_config_dict: dict | None = None
load_inplace: bool = False
def __post_init__(self):
if self.lora_int_id < 1:
+8 -1
View File
@@ -254,13 +254,20 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
# This is ok because it's currently only called from
# the single-threaded core engine loop.
if lora_request.lora_int_id not in self.list_adapters():
if (
lora_request.lora_int_id not in self.list_adapters()
or lora_request.load_inplace
):
# Load the new adapter first to ensure it is actually valid, before
# evicting any existing adapters.
# This may cause the # of loaded lora adapters to very temporarily
# exceed `--max-cpu-loras`.
lora = self._load_adapter(lora_request)
# Remove the existing adapter if it exists
# Use case for LoRA inplace
self._adapter_manager.remove_adapter(lora.id)
# Loading succeeded, now check if we will exceed cache capacity and
# evict the oldest adapter if so
if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: