mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Feat] allow inplace loading lora (#31326)
Signed-off-by: Jackmin801 <ongjackm@gmail.com> Signed-off-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -210,6 +210,24 @@ Alternatively, follow these example steps to implement your own plugin:
|
||||
|
||||
For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
|
||||
|
||||
### In-Place LoRA Reloading
|
||||
|
||||
When dynamically loading LoRA adapters, you may need to replace an existing adapter with updated weights while keeping the same name. The `load_inplace` parameter enables this functionality. This commonly occurs in asynchronous reinforcement learning setups, where adapters are continuously updated and swapped in without interrupting ongoing inference.
|
||||
|
||||
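When `load_inplace=True`, vLLM will replace the existing adapter with the new one. When `load_inplace` is omitted or `false` (the default), a request to load an adapter whose name is already loaded returns an error instead of replacing it.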
|
||||
|
||||
Example request to load or replace a LoRA adapter with the same name:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/load_lora_adapter \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"lora_name": "my-adapter",
|
||||
"lora_path": "/path/to/adapter/v2",
|
||||
"load_inplace": true
|
||||
}'
|
||||
```
|
||||
|
||||
## New format for `--lora-modules`
|
||||
|
||||
In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
|
||||
|
||||
@@ -195,6 +195,22 @@ def qwen3_lora_files():
|
||||
return snapshot_download(repo_id="charent/self_cognition_Alice")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_meowing_lora_files():
|
||||
"""Download Qwen3 LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_woofing_lora_files():
|
||||
"""Download Qwen3 LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def opt125_lora_files() -> str:
|
||||
"""Download opt-125m LoRA files once per test session."""
|
||||
|
||||
@@ -104,6 +104,82 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files
|
||||
assert dynamic_lora_model.id == "qwen3-lora-3"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_lora_adapter_with_same_name_replaces_inplace(
|
||||
client: openai.AsyncOpenAI, qwen3_meowing_lora_files, qwen3_woofing_lora_files
|
||||
):
|
||||
"""Test that loading a LoRA adapter with the same name replaces it inplace."""
|
||||
adapter_name = "replaceable-adapter"
|
||||
messages = [
|
||||
{"content": "Follow the instructions to make animal noises", "role": "system"},
|
||||
{"content": "Make your favorite animal noise.", "role": "user"},
|
||||
]
|
||||
|
||||
# Load LoRA that makes model meow
|
||||
response = await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
body={"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
|
||||
)
|
||||
assert "success" in response.lower()
|
||||
|
||||
completion = await client.chat.completions.create(
|
||||
model=adapter_name,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
assert "Meow Meow Meow" in completion.choices[0].message.content
|
||||
|
||||
# Load LoRA that makes model woof
|
||||
response = await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
body={
|
||||
"lora_name": adapter_name,
|
||||
"lora_path": qwen3_woofing_lora_files,
|
||||
"load_inplace": True,
|
||||
},
|
||||
)
|
||||
assert "success" in response.lower()
|
||||
|
||||
completion = await client.chat.completions.create(
|
||||
model=adapter_name,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
assert "Woof Woof Woof" in completion.choices[0].message.content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_lora_adapter_with_load_inplace_false_errors(
|
||||
client: openai.AsyncOpenAI, qwen3_meowing_lora_files
|
||||
):
|
||||
"""Test that load_inplace=False returns an error when adapter already exists."""
|
||||
adapter_name = "test-load-inplace-false"
|
||||
|
||||
# Load LoRA adapter first time (should succeed)
|
||||
response = await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
body={"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
|
||||
)
|
||||
assert "success" in response.lower()
|
||||
|
||||
# Try to load the same adapter again with load_inplace=False (should fail)
|
||||
with pytest.raises(openai.BadRequestError) as exc_info:
|
||||
await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
body={
|
||||
"lora_name": adapter_name,
|
||||
"lora_path": qwen3_meowing_lora_files,
|
||||
},
|
||||
)
|
||||
|
||||
# Verify the error message
|
||||
assert "already been loaded" in str(exc_info.value)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
|
||||
with pytest.raises(openai.NotFoundError):
|
||||
|
||||
@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
|
||||
return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_meowing_lora_files():
|
||||
"""Download Qwen3 Meow LoRA files once per test session."""
|
||||
return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_woofing_lora_files():
|
||||
"""Download Qwen3 Woof LoRA files once per test session."""
|
||||
return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tinyllama_lora_files():
|
||||
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
||||
|
||||
@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
|
||||
]
|
||||
|
||||
|
||||
def format_chatml_messages(prompt: str):
|
||||
def format_chatml_messages(
|
||||
prompt: str, system_prompt: str = "You are a helpful assistant."
|
||||
) -> list[dict[str, str]]:
|
||||
return [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": prompt},
|
||||
]
|
||||
|
||||
@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
|
||||
single_lora_request = lora_request[0]
|
||||
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
|
||||
def test_load_inplace_offline_reload(
|
||||
qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
|
||||
) -> None:
|
||||
"""
|
||||
Test that load_inplace=True allows reloading LoRA adapters with the same ID
|
||||
in offline mode (using LLM class directly).
|
||||
"""
|
||||
llm = LLM(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=512,
|
||||
gpu_memory_utilization=0.5,
|
||||
enforce_eager=True,
|
||||
)
|
||||
adapter_id = 1
|
||||
messages = format_chatml_messages(
|
||||
"Make your favorite animal noise.",
|
||||
system_prompt="Follow the instructions to make animal noises",
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
|
||||
# Load meowing LoRA for the first time (load_inplace left at its default)
|
||||
meowing_request = LoRARequest(
|
||||
lora_name="test-adapter",
|
||||
lora_int_id=adapter_id,
|
||||
lora_path=qwen3_meowing_lora_files,
|
||||
)
|
||||
|
||||
outputs = llm.chat([messages], sampling_params, lora_request=meowing_request)
|
||||
first_output = outputs[0].outputs[0].text.strip()
|
||||
assert "Meow Meow Meow" in first_output, (
|
||||
f"Expected meowing output, got: {first_output}"
|
||||
)
|
||||
|
||||
# Reload with woofing LoRA (same ID, different weights, load_inplace=True)
|
||||
woofing_request = LoRARequest(
|
||||
lora_name="test-adapter-woof",
|
||||
lora_int_id=adapter_id, # Same ID
|
||||
lora_path=qwen3_woofing_lora_files, # Different weights
|
||||
load_inplace=True, # Force reload
|
||||
)
|
||||
|
||||
outputs = llm.chat([messages], sampling_params, lora_request=woofing_request)
|
||||
second_output = outputs[0].outputs[0].text.strip()
|
||||
assert "Woof Woof Woof" in second_output, (
|
||||
f"Expected woofing output, got: {second_output}"
|
||||
)
|
||||
|
||||
|
||||
def test_load_inplace_false_no_reload(
|
||||
qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
|
||||
) -> None:
|
||||
"""
|
||||
Test that load_inplace=False prevents reloading when an adapter
|
||||
with the same ID already exists.
|
||||
"""
|
||||
llm = LLM(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=512,
|
||||
gpu_memory_utilization=0.5,
|
||||
enforce_eager=True,
|
||||
)
|
||||
adapter_id = 2
|
||||
messages = format_chatml_messages(
|
||||
"Make your favorite animal noise.",
|
||||
system_prompt="Follow the instructions to make animal noises",
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
|
||||
# Load meowing LoRA first (load_inplace left at its default of False)
|
||||
meowing_request_initial = LoRARequest(
|
||||
lora_name="test-adapter-2",
|
||||
lora_int_id=adapter_id,
|
||||
lora_path=qwen3_meowing_lora_files,
|
||||
)
|
||||
|
||||
outputs = llm.chat(
|
||||
[messages], sampling_params, lora_request=meowing_request_initial
|
||||
)
|
||||
first_output = outputs[0].outputs[0].text.strip()
|
||||
assert "Meow Meow Meow" in first_output, (
|
||||
f"Expected meowing output, got: {first_output}"
|
||||
)
|
||||
|
||||
# Try to load woofing LoRA with same ID but load_inplace=False
|
||||
# This should NOT reload (adapter 2 already exists)
|
||||
woofing_request_no_reload = LoRARequest(
|
||||
lora_name="test-adapter-2-woof",
|
||||
lora_int_id=adapter_id, # Same ID
|
||||
lora_path=qwen3_woofing_lora_files,
|
||||
)
|
||||
|
||||
outputs = llm.chat(
|
||||
[messages], sampling_params, lora_request=woofing_request_no_reload
|
||||
)
|
||||
second_output = outputs[0].outputs[0].text.strip()
|
||||
# Should still get meowing output because it didn't reload
|
||||
assert "Meow Meow Meow" in second_output, (
|
||||
f"Expected meowing output (no reload), got: {second_output}"
|
||||
)
|
||||
|
||||
@@ -132,9 +132,16 @@ class OpenAIServingModels:
|
||||
return error_check_ret
|
||||
|
||||
lora_path = request.lora_path
|
||||
unique_id = self.lora_id_counter.inc(1)
|
||||
lora_int_id = (
|
||||
self.lora_requests[lora_name].lora_int_id
|
||||
if lora_name in self.lora_requests
|
||||
else self.lora_id_counter.inc(1)
|
||||
)
|
||||
lora_request = LoRARequest(
|
||||
lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
|
||||
lora_name=lora_name,
|
||||
lora_int_id=lora_int_id,
|
||||
lora_path=lora_path,
|
||||
load_inplace=request.load_inplace,
|
||||
)
|
||||
if base_model_name is not None and self.is_base_model(base_model_name):
|
||||
lora_request.base_model_name = base_model_name
|
||||
@@ -187,11 +194,13 @@ class OpenAIServingModels:
|
||||
status_code=HTTPStatus.BAD_REQUEST,
|
||||
)
|
||||
|
||||
# If not loading inplace
|
||||
# Check if the lora adapter with the given name already exists
|
||||
if request.lora_name in self.lora_requests:
|
||||
if not request.load_inplace and request.lora_name in self.lora_requests:
|
||||
return create_error_response(
|
||||
message=f"The lora adapter '{request.lora_name}' has already been "
|
||||
"loaded.",
|
||||
"loaded. If you want to load the adapter in place, set 'load_inplace'"
|
||||
" to True.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST,
|
||||
)
|
||||
|
||||
@@ -36,6 +36,7 @@ def attach_router(app: FastAPI):
|
||||
request_shape={
|
||||
"lora_name": "body.name",
|
||||
"lora_path": "body.src",
|
||||
"load_inplace": "body.load_inplace || `false`",
|
||||
},
|
||||
)
|
||||
@router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
|
||||
|
||||
@@ -7,6 +7,7 @@ from pydantic import BaseModel, Field
|
||||
class LoadLoRAAdapterRequest(BaseModel):
|
||||
lora_name: str
|
||||
lora_path: str
|
||||
load_inplace: bool = False
|
||||
|
||||
|
||||
class UnloadLoRAAdapterRequest(BaseModel):
|
||||
|
||||
@@ -15,6 +15,11 @@ class LoRARequest(
|
||||
|
||||
lora_int_id must be globally unique for a given adapter.
|
||||
This is currently not enforced in vLLM.
|
||||
|
||||
load_inplace: If True, forces reloading the adapter even if one
|
||||
with the same lora_int_id already exists in the cache. This replaces
|
||||
the existing adapter in-place. If False (default), only loads if the
|
||||
adapter is not already loaded.
|
||||
"""
|
||||
|
||||
lora_name: str
|
||||
@@ -22,6 +27,7 @@ class LoRARequest(
|
||||
lora_path: str = ""
|
||||
base_model_name: str | None = msgspec.field(default=None)
|
||||
tensorizer_config_dict: dict | None = None
|
||||
load_inplace: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
if self.lora_int_id < 1:
|
||||
|
||||
@@ -254,13 +254,20 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
|
||||
# This is ok because it's currently only called from
|
||||
# the single-threaded core engine loop.
|
||||
|
||||
if lora_request.lora_int_id not in self.list_adapters():
|
||||
if (
|
||||
lora_request.lora_int_id not in self.list_adapters()
|
||||
or lora_request.load_inplace
|
||||
):
|
||||
# Load the new adapter first to ensure it is actually valid, before
|
||||
# evicting any existing adapters.
|
||||
# This may cause the # of loaded lora adapters to very temporarily
|
||||
# exceed `--max-cpu-loras`.
|
||||
lora = self._load_adapter(lora_request)
|
||||
|
||||
# Remove the existing adapter if it exists
|
||||
# Use case for LoRA inplace
|
||||
self._adapter_manager.remove_adapter(lora.id)
|
||||
|
||||
# Loading succeeded, now check if we will exceed cache capacity and
|
||||
# evict the oldest adapter if so
|
||||
if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
|
||||
|
||||
Reference in New Issue
Block a user