mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[TRTLLM-8763][chore] Deprecate pybind based GuidedDecodingConfig usage in torch backend (#8717)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
This commit is contained in:
parent
fc3b6f5331
commit
451959c60d
@ -6,7 +6,9 @@ import llguidance.torch
|
||||
import torch
|
||||
import xgrammar
|
||||
|
||||
from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
|
||||
from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig
|
||||
|
||||
from ...bindings.executor import GuidedDecodingParams
|
||||
|
||||
|
||||
class GrammarMatcher(ABC):
|
||||
|
||||
@ -5,8 +5,10 @@ from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from tensorrt_llm.llmapi.llm_args import GuidedDecodingConfig
|
||||
|
||||
from ..._utils import nvtx_range
|
||||
from ...bindings.executor import GuidedDecodingConfig, GuidedDecodingParams
|
||||
from ...bindings.executor import GuidedDecodingParams
|
||||
from ...bindings.internal.batch_manager import LlmRequestType
|
||||
from ...logger import logger
|
||||
from ..hostfunc import hostfunc
|
||||
|
||||
@ -14,9 +14,9 @@ from strenum import StrEnum
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
|
||||
from tensorrt_llm._utils import get_sm_version, mpi_disabled
|
||||
from tensorrt_llm.bindings.executor import GuidedDecodingConfig
|
||||
from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy,
|
||||
ContextChunkingPolicy, LoadFormat,
|
||||
ContextChunkingPolicy,
|
||||
GuidedDecodingConfig, LoadFormat,
|
||||
TorchLlmArgs)
|
||||
from tensorrt_llm.llmapi.tokenizer import (TokenizerBase,
|
||||
_llguidance_tokenizer_info,
|
||||
|
||||
@ -44,8 +44,7 @@ from ..bindings.executor import (BatchingType as _BatchingType,
|
||||
KvCacheConfig as _KvCacheConfig,
|
||||
LookaheadDecodingConfig as _LookaheadDecodingConfig,
|
||||
PeftCacheConfig as _PeftCacheConfig,
|
||||
SchedulerConfig as _SchedulerConfig,
|
||||
GuidedDecodingConfig as _GuidedDecodingConfig) # isort: skip
|
||||
SchedulerConfig as _SchedulerConfig) # isort: skip
|
||||
# isort: on
|
||||
|
||||
# yapf: enable
|
||||
@ -164,6 +163,26 @@ class CudaGraphConfig(StrictBaseModel):
|
||||
return batch_sizes
|
||||
|
||||
|
||||
class GuidedDecodingConfig(StrictBaseModel):
    """Configuration for guided decoding.

    Holds the backend selection together with the optional vocabulary,
    tokenizer string and stop-token ids described by the fields below.
    Every field except ``backend`` defaults to ``None``.
    """

    class GuidedDecodingBackend(Enum):
        """Guided decoding backends selectable through ``backend``."""

        XGRAMMAR = 0
        LLGUIDANCE = 1

    # Backend selection; XGRAMMAR is used when the caller gives none.
    backend: GuidedDecodingBackend = Field(
        description="The backend for guided decoding config.",
        default=GuidedDecodingBackend.XGRAMMAR,
    )

    # Optional encoded vocabulary, as a list of strings.
    encoded_vocab: Optional[List[str]] = Field(
        description="The encoded vocab for guided decoding config.",
        default=None,
    )

    # Optional tokenizer string.
    # NOTE(review): presumably a serialized tokenizer definition; confirm
    # the expected format against the code that builds this config.
    tokenizer_str: Optional[str] = Field(
        description="The tokenizer string for guided decoding config.",
        default=None,
    )

    # Optional list of stop token ids.
    stop_token_ids: Optional[List[int]] = Field(
        description="The stop token ids for guided decoding config.",
        default=None,
    )
|
||||
|
||||
|
||||
class BaseSparseAttentionConfig(StrictBaseModel):
|
||||
"""
|
||||
Configuration for sparse attention.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user