TensorRT-LLMs/tensorrt_llm/llmapi/__init__.py
Simeng Liu 8cf3faa26a
[feat] Auto-enable ngram with concurrency <= 32. (#6232)
Signed-off-by: Simeng Liu <simengl@nvidia.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
Signed-off-by: Mike Iovine <mike.iovine7@gmail.com>
Co-authored-by: Mike Iovine <miovine@nvidia.com>
Co-authored-by: Mike Iovine <mike.iovine7@gmail.com>
2025-07-31 18:45:51 -04:00

58 lines
1.9 KiB
Python

from ..disaggregated_params import DisaggregatedParams
from ..executor import CompletionOutput, RequestError
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .build_cache import BuildCacheConfig
from .llm import LLM, RequestOutput
# yapf: disable
from .llm_args import (AutoDecodingConfig, BatchingType, CacheTransceiverConfig,
CalibConfig, CapacitySchedulerPolicy,
ContextChunkingPolicy, CudaGraphConfig,
DraftTargetDecodingConfig, DynamicBatchConfig,
EagleDecodingConfig, ExtendedRuntimePerfKnobConfig,
KvCacheConfig, LlmArgs, LookaheadDecodingConfig,
MedusaDecodingConfig, MoeConfig, MTPDecodingConfig,
NGramDecodingConfig, SchedulerConfig, TorchCompileConfig,
TorchLlmArgs, TrtLlmArgs, UserProvidedDecodingConfig)
from .llm_utils import (BuildConfig, KvCacheRetentionConfig, QuantAlgo,
QuantConfig)
from .mpi_session import MpiCommSession
# Public API of this package: the names exposed by a star-import of it.
# Every entry below corresponds to a name imported at the top of this module,
# so when a symbol is added or removed, update the import and this list
# together. Entry order is kept as-is on purpose.
__all__ = [
    'LLM',
    'CompletionOutput',
    'RequestOutput',
    'GuidedDecodingParams',
    'SamplingParams',
    'DisaggregatedParams',
    'KvCacheConfig',
    'KvCacheRetentionConfig',
    'CudaGraphConfig',
    'MoeConfig',
    'LookaheadDecodingConfig',
    'MedusaDecodingConfig',
    'EagleDecodingConfig',
    'MTPDecodingConfig',
    'SchedulerConfig',
    'CapacitySchedulerPolicy',
    'BuildConfig',
    'QuantConfig',
    'QuantAlgo',
    'CalibConfig',
    'BuildCacheConfig',
    'RequestError',
    'MpiCommSession',
    'ExtendedRuntimePerfKnobConfig',
    'BatchingType',
    'ContextChunkingPolicy',
    'DynamicBatchConfig',
    'CacheTransceiverConfig',
    'NGramDecodingConfig',
    'UserProvidedDecodingConfig',
    'TorchCompileConfig',
    'DraftTargetDecodingConfig',
    'LlmArgs',
    'TorchLlmArgs',
    'TrtLlmArgs',
    'AutoDecodingConfig',
]