mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Model] Deprecate the score task (this will not affect users). (#37537)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
@@ -31,28 +31,29 @@ Of course, we also have "plugin" tasks that allow users to customize input and o
|
||||
|
||||
### Pooling Tasks
|
||||
|
||||
| Pooling Tasks | Granularity | Outputs |
|
||||
|--------------------|---------------|-------------------------------------------------|
|
||||
| `classify` | Sequence-wise | probability vector of classes for each sequence |
|
||||
| `score` (see note) | Sequence-wise | reranker score for each sequence |
|
||||
| `embed` | Sequence-wise | vector representations for each sequence |
|
||||
| `token_classify` | Token-wise | probability vector of classes for each token |
|
||||
| `token_embed` | Token-wise | vector representations for each token |
|
||||
| Pooling Tasks | Granularity | Outputs |
|
||||
|-----------------------|---------------|-------------------------------------------------|
|
||||
| `classify` (see note) | Sequence-wise | probability vector of classes for each sequence |
|
||||
| `embed` | Sequence-wise | vector representations for each sequence |
|
||||
| `token_classify` | Token-wise | probability vector of classes for each token |
|
||||
| `token_embed` | Token-wise | vector representations for each token |
|
||||
|
||||
!!! note
|
||||
Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
|
||||
|
||||
### Score Types
|
||||
|
||||
| Pooling Tasks | Granularity | Outputs | Score Types | scoring function |
|
||||
|--------------------|---------------|-------------------------------------------------|--------------------|--------------------------|
|
||||
| `classify` | Sequence-wise | probability vector of classes for each sequence | nan | nan |
|
||||
| `score` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier |
|
||||
| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity |
|
||||
| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan |
|
||||
| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) |
|
||||
Scoring models are designed to compute similarity scores between two input prompts. Three model types (aka `score_type`) are supported: `cross-encoder`, `late-interaction`, and `bi-encoder`.
|
||||
|
||||
Score models are designed to compute similarity scores between two input prompts. Three model types (aka `score_type`) are supported: `cross-encoder`, `late-interaction`, and `bi-encoder`.
|
||||
| Pooling Tasks | Granularity | Outputs | Score Types | scoring function |
|
||||
|-----------------------|---------------|----------------------------------------------|--------------------|--------------------------|
|
||||
| `classify` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier |
|
||||
| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity |
|
||||
| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan |
|
||||
| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) |
|
||||
|
||||
!!! note
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
|
||||
|
||||
### Pooling Usages
|
||||
|
||||
@@ -85,14 +86,16 @@ enabling the corresponding APIs.
|
||||
|
||||
### Offline APIs corresponding to pooling tasks
|
||||
|
||||
| Task | APIs |
|
||||
|------------------|----------------------------------------------------------------------------|
|
||||
| `embed` | `LLM.embed(...)`,`LLM.encode(..., pooling_task="embed")`, `LLM.score(...)` |
|
||||
| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` |
|
||||
| `score` | `LLM.score(...)` |
|
||||
| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` |
|
||||
| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` |
|
||||
| `plugin` | `LLM.encode(..., pooling_task="plugin")` |
|
||||
| Task | APIs |
|
||||
|------------------|---------------------------------------------------------------------------------------|
|
||||
| `embed` | `LLM.embed(...)`, `LLM.encode(..., pooling_task="embed")`, `LLM.score(...)`(see note) |
|
||||
| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`, `LLM.score(...)` |
|
||||
| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` |
|
||||
| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` |
|
||||
| `plugin` | `LLM.encode(..., pooling_task="plugin")` |
|
||||
|
||||
!!! note
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
|
||||
|
||||
### `LLM.classify`
|
||||
|
||||
@@ -206,11 +209,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe
|
||||
vLLM will attempt to automatically convert the model according to the architecture names
|
||||
shown in the table below.
|
||||
|
||||
| Architecture | `--convert` | Supported pooling tasks |
|
||||
| ----------------------------------------------- | ----------- | ------------------------------------- |
|
||||
| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` |
|
||||
| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` |
|
||||
| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` |
|
||||
| Architecture | `--convert` | Supported pooling tasks |
|
||||
|-------------------------------------------------|-------------|------------------------------|
|
||||
| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` |
|
||||
| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` |
|
||||
| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify` |
|
||||
|
||||
!!! tip
|
||||
You can explicitly set `--convert <type>` to specify how to convert the model.
|
||||
@@ -251,3 +254,7 @@ Pooling models now default support all pooling, you can use it without any setti
|
||||
|
||||
- For extracting hidden states, prefer using the `token_embed` task.
|
||||
- For Named Entity Recognition (NER) and reward models, prefer using the `token_classify` task.
|
||||
|
||||
### Score task
|
||||
|
||||
The `score` task is deprecated and will be removed in v0.20. Please use `classify` instead. A classification model can be used as a scoring model, and have its scoring API enabled, only when it outputs num_labels equal to 1.
|
||||
|
||||
@@ -17,6 +17,8 @@ The key distinction between (sequence) classification and token classification l
|
||||
|
||||
Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md).
|
||||
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. For more information, please refer to [this page](scoring.md).
|
||||
|
||||
## Typical Use Cases
|
||||
|
||||
### Classification
|
||||
@@ -54,7 +56,7 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md).
|
||||
|
||||
--8<-- "docs/models/pooling_models/scoring.md:supported-score-models"
|
||||
--8<-- "docs/models/pooling_models/scoring.md:supported-cross-encoder-models"
|
||||
|
||||
### Reward Models
|
||||
|
||||
|
||||
@@ -10,11 +10,11 @@ The score models is designed to compute similarity scores between two input prom
|
||||
- Model Usage: Scoring
|
||||
- Pooling Task:
|
||||
|
||||
| Score Types | Pooling Tasks | scoring function |
|
||||
|--------------------|---------------|--------------------------|
|
||||
| `cross-encoder` | `score` | linear classifier |
|
||||
| `late-interaction` | `token_embed` | late interaction(MaxSim) |
|
||||
| `bi-encoder` | `embed` | cosine similarity |
|
||||
| Score Types | Pooling Tasks | scoring function |
|
||||
|--------------------|-----------------------|--------------------------|
|
||||
| `cross-encoder` | `classify` (see note) | linear classifier |
|
||||
| `late-interaction` | `token_embed` | late interaction(MaxSim) |
|
||||
| `bi-encoder` | `embed` | cosine similarity |
|
||||
|
||||
- Offline APIs:
|
||||
- `LLM.score`
|
||||
@@ -22,13 +22,16 @@ The score models is designed to compute similarity scores between two input prom
|
||||
- [Score API](scoring.md#score-api) (`/score`)
|
||||
- [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
|
||||
|
||||
!!! note
|
||||
Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Cross-encoder models
|
||||
|
||||
[Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
|
||||
|
||||
--8<-- [start:supported-score-models]
|
||||
--8<-- [start:supported-cross-encoder-models]
|
||||
|
||||
#### Text-only Models
|
||||
|
||||
@@ -99,7 +102,7 @@ The score models is designed to compute similarity scores between two input prom
|
||||
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
--8<-- [end:supported-score-models]
|
||||
--8<-- [end:supported-cross-encoder-models]
|
||||
|
||||
### Late-interaction models
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ def test_embed_dimensions(model_info: EmbedModelInfo):
|
||||
pooling_params.verify(model_config)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("task", ["score", "classify"])
|
||||
@pytest.mark.parametrize("task", ["classify"])
|
||||
def test_classify(task):
|
||||
model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
|
||||
|
||||
|
||||
@@ -1435,10 +1435,10 @@ class ModelConfig:
|
||||
@property
|
||||
def score_type(self) -> ScoreType:
|
||||
"""
|
||||
Score API handles score/rerank for:
|
||||
- "score" task (score_type: cross-encoder models)
|
||||
- "embed" task (score_type: bi-encoder models)
|
||||
- "token_embed" task (score_type: late interaction models)
|
||||
Scoring API handles score/rerank for:\n
|
||||
- "classify" task (score_type: cross-encoder models)\n
|
||||
- "embed" task (score_type: bi-encoder models)\n
|
||||
- "token_embed" task (score_type: late interaction models)\n
|
||||
"""
|
||||
# fixme: self._model_info.score_type is the score type before
|
||||
# as_seq_cls_model, which is "bi-encoder", rather than the
|
||||
|
||||
@@ -1477,9 +1477,9 @@ class LLM:
|
||||
data_1 = data_1 * len(data_2)
|
||||
|
||||
if pooling_params is None:
|
||||
pooling_params = PoolingParams(task="score")
|
||||
pooling_params = PoolingParams(task="classify")
|
||||
elif pooling_params.task is None:
|
||||
pooling_params.task = "score"
|
||||
pooling_params.task = "classify"
|
||||
|
||||
pooling_params_list = list[PoolingParams]()
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
from starlette.datastructures import State
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import load_chat_template
|
||||
@@ -155,7 +155,9 @@ async def build_async_engine_client_from_engine_args(
|
||||
|
||||
|
||||
def build_app(
|
||||
args: Namespace, supported_tasks: tuple["SupportedTask", ...] | None = None
|
||||
args: Namespace,
|
||||
supported_tasks: tuple["SupportedTask", ...] | None = None,
|
||||
model_config: ModelConfig | None = None,
|
||||
) -> FastAPI:
|
||||
if supported_tasks is None:
|
||||
warnings.warn(
|
||||
@@ -191,7 +193,7 @@ def build_app(
|
||||
attach_router as register_sagemaker_api_router,
|
||||
)
|
||||
|
||||
register_sagemaker_api_router(app, supported_tasks)
|
||||
register_sagemaker_api_router(app, supported_tasks, model_config)
|
||||
|
||||
if "generate" in supported_tasks:
|
||||
from vllm.entrypoints.openai.generate.api_router import (
|
||||
@@ -242,7 +244,7 @@ def build_app(
|
||||
if any(task in POOLING_TASKS for task in supported_tasks):
|
||||
from vllm.entrypoints.pooling import register_pooling_api_routers
|
||||
|
||||
register_pooling_api_routers(app, supported_tasks)
|
||||
register_pooling_api_routers(app, supported_tasks, model_config)
|
||||
|
||||
app.root_path = args.root_path
|
||||
app.add_middleware(
|
||||
@@ -583,8 +585,10 @@ async def build_and_serve(
|
||||
uvicorn_kwargs["log_config"] = log_config
|
||||
|
||||
supported_tasks = await engine_client.get_supported_tasks()
|
||||
model_config = engine_client.model_config
|
||||
|
||||
logger.info("Supported tasks: %s", supported_tasks)
|
||||
app = build_app(args, supported_tasks)
|
||||
app = build_app(args, supported_tasks, model_config)
|
||||
await init_app_state(engine_client, app.state, args, supported_tasks)
|
||||
|
||||
logger.info("Starting vLLM server on %s", listen_address)
|
||||
|
||||
@@ -5,6 +5,9 @@ from typing import TYPE_CHECKING
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from argparse import Namespace
|
||||
|
||||
@@ -17,9 +20,30 @@ else:
|
||||
RequestLogger = object
|
||||
SupportedTask = object
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def enable_scoring_api(
    supported_tasks: tuple["SupportedTask", ...],
    model_config: ModelConfig | None = None,
) -> bool:
    """Return whether the score/rerank API should be enabled.

    The API is enabled when:

    - "embed" or "token_embed" is among ``supported_tasks``; or
    - ``model_config`` is provided, "classify" is among ``supported_tasks``
      and ``model_config.hf_config.num_labels`` equals 1 (a missing
      ``num_labels`` attribute is treated as 0).

    In every other case, including a "classify"-only model with no
    ``model_config``, the API stays disabled.
    """
    # Embedding-style tasks can always be scored, regardless of model_config.
    if "embed" in supported_tasks or "token_embed" in supported_tasks:
        return True

    # Without a config there is no way to inspect num_labels.
    if model_config is None or "classify" not in supported_tasks:
        return False

    # Only single-label classification models qualify for scoring.
    if getattr(model_config.hf_config, "num_labels", 0) == 1:
        return True

    logger.debug_once("Score API is only enabled for num_labels == 1.")
    return False
|
||||
|
||||
|
||||
def register_pooling_api_routers(
|
||||
app: FastAPI, supported_tasks: tuple["SupportedTask", ...]
|
||||
app: FastAPI,
|
||||
supported_tasks: tuple["SupportedTask", ...],
|
||||
model_config: ModelConfig | None = None,
|
||||
):
|
||||
from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router
|
||||
|
||||
@@ -37,11 +61,7 @@ def register_pooling_api_routers(
|
||||
|
||||
app.include_router(embed_router)
|
||||
|
||||
# Score API handles score/rerank for:
|
||||
# - "score" task (score_type: cross-encoder models)
|
||||
# - "embed" task (score_type: bi-encoder models)
|
||||
# - "token_embed" task (score_type: late interaction models)
|
||||
if any(t in supported_tasks for t in ("score", "embed", "token_embed")):
|
||||
if enable_scoring_api(supported_tasks, model_config):
|
||||
from vllm.entrypoints.pooling.score.api_router import router as score_router
|
||||
|
||||
app.include_router(score_router)
|
||||
@@ -61,6 +81,8 @@ def init_pooling_state(
|
||||
from vllm.entrypoints.pooling.score.serving import ServingScores
|
||||
from vllm.tasks import POOLING_TASKS
|
||||
|
||||
model_config = engine_client.model_config
|
||||
|
||||
resolved_chat_template = load_chat_template(args.chat_template)
|
||||
|
||||
state.serving_pooling = (
|
||||
@@ -102,10 +124,6 @@ def init_pooling_state(
|
||||
if "classify" in supported_tasks
|
||||
else None
|
||||
)
|
||||
# Score API handles score/rerank for:
|
||||
# - "score" task (score_type: cross-encoder models)
|
||||
# - "embed" task (score_type: bi-encoder models)
|
||||
# - "token_embed" task (score_type: late interaction models)
|
||||
state.serving_scores = (
|
||||
ServingScores(
|
||||
engine_client,
|
||||
@@ -114,6 +132,6 @@ def init_pooling_state(
|
||||
score_template=resolved_chat_template,
|
||||
log_error_stack=args.log_error_stack,
|
||||
)
|
||||
if any(t in supported_tasks for t in ("embed", "score", "token_embed"))
|
||||
if enable_scoring_api(supported_tasks, model_config)
|
||||
else None
|
||||
)
|
||||
|
||||
@@ -35,7 +35,7 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
max_total_tokens_param="max_model_len",
|
||||
)
|
||||
|
||||
def to_pooling_params(self, task: PoolingTask = "score"):
|
||||
def to_pooling_params(self, task: PoolingTask = "classify"):
|
||||
return PoolingParams(
|
||||
task=task,
|
||||
use_activation=self.use_activation,
|
||||
@@ -111,7 +111,7 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
max_total_tokens_param="max_model_len",
|
||||
)
|
||||
|
||||
def to_pooling_params(self, task: PoolingTask = "score"):
|
||||
def to_pooling_params(self, task: PoolingTask = "classify"):
|
||||
return PoolingParams(
|
||||
task=task,
|
||||
use_activation=self.use_activation,
|
||||
|
||||
@@ -413,7 +413,7 @@ class ServingScores(OpenAIServing):
|
||||
# Schedule the request and get the result generator.
|
||||
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
|
||||
|
||||
default_pooling_params = request.to_pooling_params("score")
|
||||
default_pooling_params = request.to_pooling_params("classify")
|
||||
|
||||
for i, engine_prompt in enumerate(engine_prompts):
|
||||
request_id_item = f"{request_id}-{i}"
|
||||
|
||||
@@ -10,9 +10,11 @@ import pydantic
|
||||
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling import enable_scoring_api
|
||||
from vllm.entrypoints.pooling.base.serving import PoolingServing
|
||||
from vllm.entrypoints.serve.instrumentator.basic import base
|
||||
from vllm.entrypoints.serve.instrumentator.health import health
|
||||
@@ -25,7 +27,10 @@ GetHandlerFn = Callable[[Request], OpenAIServing | PoolingServing | None]
|
||||
EndpointFn = Callable[[RequestType, Request], Awaitable[Any]]
|
||||
|
||||
|
||||
def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
|
||||
def get_invocation_types(
|
||||
supported_tasks: tuple["SupportedTask", ...],
|
||||
model_config: ModelConfig | None = None,
|
||||
):
|
||||
# NOTE: Items defined earlier take higher priority
|
||||
INVOCATION_TYPES: list[tuple[RequestType, tuple[GetHandlerFn, EndpointFn]]] = []
|
||||
|
||||
@@ -70,7 +75,7 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
|
||||
(ClassificationRequest, (classify, create_classify)),
|
||||
]
|
||||
|
||||
if "score" in supported_tasks:
|
||||
if enable_scoring_api(supported_tasks, model_config):
|
||||
from vllm.entrypoints.pooling.score.api_router import do_rerank, rerank
|
||||
from vllm.entrypoints.pooling.score.protocol import RerankRequest
|
||||
|
||||
@@ -78,7 +83,6 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
|
||||
(RerankRequest, (rerank, do_rerank)),
|
||||
]
|
||||
|
||||
if "score" in supported_tasks or "embed" in supported_tasks:
|
||||
from vllm.entrypoints.pooling.score.api_router import create_score, score
|
||||
from vllm.entrypoints.pooling.score.protocol import ScoreRequest
|
||||
|
||||
@@ -97,11 +101,15 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
|
||||
return INVOCATION_TYPES
|
||||
|
||||
|
||||
def attach_router(app: FastAPI, supported_tasks: tuple["SupportedTask", ...]):
|
||||
def attach_router(
|
||||
app: FastAPI,
|
||||
supported_tasks: tuple["SupportedTask", ...],
|
||||
model_config: ModelConfig | None = None,
|
||||
):
|
||||
router = APIRouter()
|
||||
|
||||
# NOTE: Construct the TypeAdapters only once
|
||||
INVOCATION_TYPES = get_invocation_types(supported_tasks)
|
||||
INVOCATION_TYPES = get_invocation_types(supported_tasks, model_config)
|
||||
INVOCATION_VALIDATORS = [
|
||||
(pydantic.TypeAdapter(request_type), (get_handler, endpoint))
|
||||
for request_type, (get_handler, endpoint) in INVOCATION_TYPES
|
||||
|
||||
@@ -16,25 +16,22 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def get_classification_act_fn(
|
||||
def get_act_fn(
|
||||
config: PretrainedConfig,
|
||||
static_num_labels: bool = True,
|
||||
) -> "PoolerActivation":
|
||||
# get classification act_fn
|
||||
# Implement alignment with transformers ForSequenceClassificationLoss
|
||||
# https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92
|
||||
problem_type = getattr(config, "problem_type", "")
|
||||
if problem_type == "regression":
|
||||
return PoolerIdentity()
|
||||
if problem_type == "single_label_classification":
|
||||
return PoolerClassify()
|
||||
return PoolerClassify(static_num_labels=static_num_labels)
|
||||
if problem_type == "multi_label_classification":
|
||||
return PoolerMultiLabelClassify()
|
||||
|
||||
return PoolerClassify()
|
||||
|
||||
|
||||
def get_cross_encoder_act_fn(
|
||||
config: PretrainedConfig,
|
||||
) -> "PoolerActivation":
|
||||
# get cross_encoder act_fn
|
||||
function_name: str | None = None
|
||||
if (
|
||||
hasattr(config, "sentence_transformers")
|
||||
@@ -55,24 +52,16 @@ def get_cross_encoder_act_fn(
|
||||
fn = resolve_obj_by_qualname(function_name)()
|
||||
return PoolerActivation.wraps(fn)
|
||||
|
||||
return PoolerClassify()
|
||||
return PoolerClassify(static_num_labels=static_num_labels)
|
||||
|
||||
|
||||
def resolve_classifier_act_fn(
|
||||
model_config: ModelConfig,
|
||||
static_num_labels: bool = True,
|
||||
act_fn: "PoolerActivation | str | None" = None,
|
||||
act_fn: "PoolerActivation | None" = None,
|
||||
):
|
||||
if isinstance(act_fn, str):
|
||||
if act_fn == "classify":
|
||||
return get_classification_act_fn(model_config.hf_config)
|
||||
if act_fn == "score":
|
||||
return get_cross_encoder_act_fn(model_config.hf_config)
|
||||
|
||||
raise ValueError(f"act_fn [{act_fn=}] not supported.")
|
||||
|
||||
if act_fn is None:
|
||||
return PoolerClassify(static_num_labels=static_num_labels)
|
||||
return get_act_fn(model_config.hf_config, static_num_labels)
|
||||
|
||||
assert callable(act_fn)
|
||||
return act_fn
|
||||
@@ -97,9 +86,8 @@ class PoolerActivation(nn.Module, ABC):
|
||||
|
||||
def forward(self, pooled_data: _T) -> _T:
|
||||
# shape:
|
||||
# classify (& score) -> (batch_size, num_classes)
|
||||
# embed -> (batch_size, embedding_dim) or list(embedding_dim)
|
||||
# (batch_size, dimensions) or list(dimensions) if using MRL
|
||||
# classify -> (batch_size, num_classes)
|
||||
# embed -> (batch_size, embedding_size) or list(embedding_size)
|
||||
if isinstance(pooled_data, list):
|
||||
return [self.forward_chunk(data) for data in pooled_data]
|
||||
|
||||
|
||||
@@ -56,29 +56,31 @@ class EmbeddingPoolerHead(SequencePoolerHead):
|
||||
|
||||
if isinstance(pooled_data, list):
|
||||
pooled_data = torch.stack(pooled_data)
|
||||
# pooled_data shape: [batchsize, hidden_dimension]
|
||||
# pooled_data shape: [batchsize, hidden_size]
|
||||
|
||||
if self.head_dtype is not None:
|
||||
pooled_data = pooled_data.to(self.head_dtype)
|
||||
|
||||
# Apply ST projector
|
||||
if self.projector is not None:
|
||||
pooled_data = self.projector(pooled_data)
|
||||
# pooled_data shape: [batchsize, embedding_dimension]
|
||||
embeddings = self.projector(pooled_data)
|
||||
else:
|
||||
embeddings = pooled_data
|
||||
# embeddings shape: [batchsize, embedding_size]
|
||||
|
||||
# for matryoshka representation
|
||||
dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
|
||||
if any(d is not None for d in dimensions_list):
|
||||
# change the output dimension
|
||||
assert len(pooled_data) == len(dimensions_list)
|
||||
if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
|
||||
assert len(embeddings) == len(dimensions_list)
|
||||
if len(set(dimensions_list)) == 1 and not isinstance(embeddings, list):
|
||||
# if all dimensions are the same
|
||||
d = dimensions_list[0]
|
||||
pooled_data = pooled_data[..., :d]
|
||||
embeddings = embeddings[..., :d]
|
||||
else:
|
||||
pooled_data = [
|
||||
embeddings = [
|
||||
vecs if d is None else vecs[..., :d]
|
||||
for vecs, d in zip(pooled_data, dimensions_list)
|
||||
for vecs, d in zip(embeddings, dimensions_list)
|
||||
]
|
||||
|
||||
# for normalize
|
||||
@@ -86,15 +88,15 @@ class EmbeddingPoolerHead(SequencePoolerHead):
|
||||
flags = [p.use_activation for p in pooling_params]
|
||||
if len(set(flags)) == 1:
|
||||
if flags[0]:
|
||||
pooled_data = self.activation(pooled_data)
|
||||
embeddings = self.activation(embeddings)
|
||||
else:
|
||||
pooled_data = [
|
||||
embeddings = [
|
||||
self.activation(vecs) if f else vecs
|
||||
for vecs, f in zip(pooled_data, flags)
|
||||
for vecs, f in zip(embeddings, flags)
|
||||
]
|
||||
|
||||
# pooled_data shape: [batchsize, embedding_dimension]
|
||||
return pooled_data
|
||||
# embeddings shape: [batchsize, embedding_size]
|
||||
return embeddings
|
||||
|
||||
|
||||
class ClassifierPoolerHead(SequencePoolerHead):
|
||||
@@ -113,7 +115,7 @@ class ClassifierPoolerHead(SequencePoolerHead):
|
||||
self.activation = activation
|
||||
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"classify", "score"}
|
||||
return {"classify"}
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -131,21 +133,23 @@ class ClassifierPoolerHead(SequencePoolerHead):
|
||||
pooled_data = pooled_data.to(self.head_dtype)
|
||||
|
||||
if self.classifier is not None:
|
||||
pooled_data = self.classifier(pooled_data)
|
||||
# pooled_data shape: [batchsize, num_labels]
|
||||
logits = self.classifier(pooled_data)
|
||||
else:
|
||||
logits = pooled_data
|
||||
|
||||
# logits shape: [batchsize, num_labels]
|
||||
if self.logit_bias is not None:
|
||||
pooled_data -= self.logit_bias
|
||||
logits -= self.logit_bias
|
||||
|
||||
if self.activation is not None:
|
||||
flags = [p.use_activation for p in pooling_params]
|
||||
if len(set(flags)) == 1:
|
||||
pooled_data = self.activation(pooled_data) if flags[0] else pooled_data
|
||||
logits = self.activation(logits) if flags[0] else logits
|
||||
else:
|
||||
pooled_data = [
|
||||
logits = [
|
||||
self.activation(vecs) if f else vecs
|
||||
for vecs, f in zip(pooled_data, flags)
|
||||
for vecs, f in zip(logits, flags)
|
||||
]
|
||||
|
||||
# pooled_data shape: [batchsize, num_labels]
|
||||
return pooled_data
|
||||
# logits shape: [batchsize, num_labels]
|
||||
return logits
|
||||
|
||||
@@ -17,7 +17,7 @@ SequencePoolingMethodOutput: TypeAlias = torch.Tensor | list[torch.Tensor]
|
||||
|
||||
class SequencePoolingMethod(nn.Module, ABC):
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"token_embed", "token_classify", "embed", "classify", "score"}
|
||||
return {"token_embed", "token_classify", "embed", "classify"}
|
||||
|
||||
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
|
||||
return PoolingParamsUpdate()
|
||||
|
||||
@@ -108,7 +108,7 @@ def pooler_for_classify(
|
||||
*,
|
||||
pooling: SequencePoolingMethod | SequencePoolingFn | None = None,
|
||||
classifier: ClassifierFn | None = None,
|
||||
act_fn: PoolerActivation | str | None = None,
|
||||
act_fn: PoolerActivation | None = None,
|
||||
):
|
||||
if pooling is None:
|
||||
pooling = get_seq_pooling_method(pooler_config.get_seq_pooling_type())
|
||||
|
||||
@@ -52,13 +52,6 @@ class DispatchPooler(Pooler):
|
||||
pooler_config,
|
||||
pooling=pooling,
|
||||
classifier=classifier,
|
||||
act_fn="classify",
|
||||
),
|
||||
"score": pooler_for_classify(
|
||||
pooler_config,
|
||||
pooling=pooling,
|
||||
classifier=classifier,
|
||||
act_fn="score",
|
||||
),
|
||||
}
|
||||
)
|
||||
@@ -115,7 +108,7 @@ class DispatchPooler(Pooler):
|
||||
|
||||
class IdentityPooler(Pooler):
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
return {"plugin", "score"}
|
||||
return {"plugin"}
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
||||
@@ -68,22 +68,24 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
|
||||
|
||||
if self.head_dtype is not None:
|
||||
pooled_data = pooled_data.to(self.head_dtype)
|
||||
# pooled_data shape: [n_tokens, hidden_dimension]
|
||||
# pooled_data shape: [n_tokens, hidden_size]
|
||||
|
||||
# Apply ST projector
|
||||
if self.projector is not None:
|
||||
pooled_data = self.projector(pooled_data)
|
||||
# pooled_data shape: [n_tokens, embedding_dimension]
|
||||
embeddings = self.projector(pooled_data)
|
||||
else:
|
||||
embeddings = pooled_data
|
||||
# embeddings shape: [n_tokens, embedding_size]
|
||||
|
||||
# for matryoshka representation
|
||||
pooled_data = pooled_data[..., : pooling_param.dimensions]
|
||||
embeddings = embeddings[..., : pooling_param.dimensions]
|
||||
|
||||
# for normalize
|
||||
if self.activation is not None and pooling_param.use_activation:
|
||||
pooled_data = self.activation(pooled_data)
|
||||
embeddings = self.activation(embeddings)
|
||||
|
||||
# pooled_data shape: [n_tokens, embedding_dimension]
|
||||
return pooled_data
|
||||
# embeddings shape: [n_tokens, embedding_size]
|
||||
return embeddings
|
||||
|
||||
|
||||
class TokenClassifierPoolerHead(TokenPoolerHead):
|
||||
@@ -118,16 +120,16 @@ class TokenClassifierPoolerHead(TokenPoolerHead):
|
||||
# hidden_states shape: [n_token, hidden_size]
|
||||
|
||||
if self.classifier is not None:
|
||||
scores = self.classifier(pooled_data)
|
||||
logits = self.classifier(pooled_data)
|
||||
else:
|
||||
scores = pooled_data
|
||||
# scores shape: [n_token, num_labels]
|
||||
logits = pooled_data
|
||||
# logits shape: [n_token, num_labels]
|
||||
|
||||
if self.logit_bias is not None:
|
||||
scores -= self.logit_bias
|
||||
logits -= self.logit_bias
|
||||
|
||||
if self.activation is not None and pooling_param.use_activation:
|
||||
scores = self.activation(scores)
|
||||
logits = self.activation(logits)
|
||||
|
||||
# scores shape: [n_token, num_labels]
|
||||
return scores
|
||||
# logits shape: [n_token, num_labels]
|
||||
return logits
|
||||
|
||||
@@ -116,7 +116,7 @@ def pooler_for_token_classify(
|
||||
*,
|
||||
pooling: TokenPoolingMethod | TokenPoolingFn | None = None,
|
||||
classifier: ClassifierFn | None = None,
|
||||
act_fn: PoolerActivation | str | None = None,
|
||||
act_fn: PoolerActivation | None = None,
|
||||
):
|
||||
if pooling is None:
|
||||
pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type())
|
||||
|
||||
@@ -194,18 +194,18 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
|
||||
[vllm.config.model.ModelConfig.score_type][]
|
||||
to use by default.
|
||||
|
||||
Score API handles score/rerank for:
|
||||
- "score" task (score_type: cross-encoder models)
|
||||
- "embed" task (score_type: bi-encoder models)
|
||||
- "token_embed" task (score_type: late interaction models)
|
||||
Scoring API handles score/rerank for:\n
|
||||
- "classify" task (score_type: cross-encoder models)\n
|
||||
- "embed" task (score_type: bi-encoder models)\n
|
||||
- "token_embed" task (score_type: late interaction models)\n
|
||||
|
||||
score_type defaults to bi-encoder, then the Score API uses the "embed" task.
|
||||
score_type defaults to bi-encoder, then the Score API uses the "embed" task.\n
|
||||
If you set score_type to cross-encoder via
|
||||
[vllm.model_executor.models.interfaces.SupportsCrossEncoding][],
|
||||
then the Score API uses the "score" task.
|
||||
then the Score API uses the "classify" task.\n
|
||||
If you set score_type to late-interaction via
|
||||
[vllm.model_executor.models.interfaces.SupportsLateInteraction][],
|
||||
then the Score API uses the "token_embed" task.
|
||||
then the Score API uses the "token_embed" task.\n
|
||||
"""
|
||||
|
||||
pooler: Pooler
|
||||
|
||||
+11
-6
@@ -7,9 +7,12 @@ from typing import Any
|
||||
import msgspec
|
||||
|
||||
from vllm.config import ModelConfig, PoolerConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
from vllm.tasks import PoolingTask
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class LateInteractionParams(
|
||||
msgspec.Struct,
|
||||
@@ -54,10 +57,6 @@ class PoolingParams(
|
||||
dimensions: int | None = None
|
||||
# --8<-- [end:embed-pooling-params]
|
||||
|
||||
## for classification, scoring and rerank
|
||||
# --8<-- [start:classify-pooling-params]
|
||||
# --8<-- [end:classify-pooling-params]
|
||||
|
||||
## for step pooling models
|
||||
step_tag_id: int | None = None
|
||||
returned_token_ids: list[int] | None = None
|
||||
@@ -79,7 +78,6 @@ class PoolingParams(
|
||||
return {
|
||||
"embed": ["dimensions", "use_activation"],
|
||||
"classify": ["use_activation"],
|
||||
"score": ["use_activation"],
|
||||
"token_embed": ["dimensions", "use_activation"],
|
||||
"token_classify": ["use_activation"],
|
||||
}
|
||||
@@ -89,6 +87,13 @@ class PoolingParams(
|
||||
return deepcopy(self)
|
||||
|
||||
def verify(self, model_config: ModelConfig) -> None:
|
||||
if self.task == "score":
|
||||
logger.warning_once(
|
||||
"`score` task is deprecated and will be removed in v0.20. "
|
||||
"Please use `classify` instead."
|
||||
)
|
||||
self.task = "classify"
|
||||
|
||||
# plugin task uses io_processor.parse_request to verify inputs,
|
||||
# skipping PoolingParams verify
|
||||
if self.task == "plugin":
|
||||
@@ -184,7 +189,7 @@ class PoolingParams(
|
||||
elif self.dimensions < 1:
|
||||
raise ValueError("Dimensions must be greater than 0")
|
||||
|
||||
elif self.task in ["classify", "score", "token_classify"]:
|
||||
elif self.task in ["classify", "token_classify"]:
|
||||
if self.use_activation is None:
|
||||
self.use_activation = True
|
||||
else:
|
||||
|
||||
@@ -8,7 +8,6 @@ GENERATION_TASKS: tuple[GenerationTask, ...] = get_args(GenerationTask)
|
||||
PoolingTask = Literal[
|
||||
"embed",
|
||||
"classify",
|
||||
"score",
|
||||
"token_embed",
|
||||
"token_classify",
|
||||
"plugin",
|
||||
@@ -16,10 +15,6 @@ PoolingTask = Literal[
|
||||
]
|
||||
POOLING_TASKS: tuple[PoolingTask, ...] = get_args(PoolingTask)
|
||||
|
||||
# Score API handles score/rerank for:
|
||||
# - "score" task (score_type: cross-encoder models)
|
||||
# - "embed" task (score_type: bi-encoder models)
|
||||
# - "token_embed" task (score_type: late interaction models)
|
||||
ScoreType = Literal["bi-encoder", "cross-encoder", "late-interaction"]
|
||||
|
||||
FrontendTask = Literal["render"]
|
||||
|
||||
@@ -2834,15 +2834,7 @@ class GPUModelRunner(
|
||||
if not is_pooling_model(model):
|
||||
return []
|
||||
|
||||
supported_tasks = list(model.pooler.get_supported_tasks())
|
||||
|
||||
if "score" in supported_tasks:
|
||||
num_labels = getattr(self.model_config.hf_config, "num_labels", 0)
|
||||
if num_labels != 1:
|
||||
supported_tasks.remove("score")
|
||||
logger.debug_once("Score API is only enabled for num_labels == 1.")
|
||||
|
||||
return supported_tasks
|
||||
return list(model.pooler.get_supported_tasks())
|
||||
|
||||
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
|
||||
tasks = list[SupportedTask]()
|
||||
|
||||
Reference in New Issue
Block a user