mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* v1.5 Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com> v1.5.4 Add back draft_overhead to spec dec stats Signed-off-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> * v1.5.5: fix CI error Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com> * v1.6: fix CI error 8196 > 8192 Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com> * Address reviewer concerns Signed-off-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> * Address reviewer concerns Signed-off-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> * precommit run Signed-off-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> * v2.0: Address reviewer concerns Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com> * v2.1: add fix from wili Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com> * Revert changes that require use of TypeAlias because that requires python version >= 3.10 Signed-off-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> --------- Signed-off-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com> Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com> Co-authored-by: wili-65535 <wili-65535@users.noreply.github.com>
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
from ..disaggregated_params import DisaggregatedParams
|
|
from ..executor import CompletionOutput, RequestError
|
|
from ..sampling_params import GuidedDecodingParams, SamplingParams
|
|
from .build_cache import BuildCacheConfig
|
|
from .llm import LLM, RequestOutput
|
|
from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig,
|
|
CapacitySchedulerPolicy, ContextChunkingPolicy,
|
|
DynamicBatchConfig, EagleDecodingConfig,
|
|
ExtendedRuntimePerfKnobConfig, KvCacheConfig,
|
|
LookaheadDecodingConfig, MedusaDecodingConfig,
|
|
MTPDecodingConfig, NGramDecodingConfig, SchedulerConfig)
|
|
from .llm_utils import (BuildConfig, KvCacheRetentionConfig, QuantAlgo,
|
|
QuantConfig)
|
|
from .mpi_session import MpiCommSession
|
|
|
|
# Names exported by `from <this package> import *`. Every entry below matches
# a name brought in by the import statements at the top of this module, and
# the entries are kept in their established order.
__all__ = [
    # Names imported from .llm and ..executor
    'LLM',
    'CompletionOutput',
    'RequestOutput',

    # *Params classes
    'GuidedDecodingParams',
    'SamplingParams',
    'DisaggregatedParams',

    # KvCache* configuration classes
    'KvCacheConfig',
    'KvCacheRetentionConfig',

    # *DecodingConfig classes (NGramDecodingConfig is listed at the end)
    'LookaheadDecodingConfig',
    'MedusaDecodingConfig',
    'EagleDecodingConfig',
    'MTPDecodingConfig',

    # Scheduler-related names
    'SchedulerConfig',
    'CapacitySchedulerPolicy',

    # Build / quantization / calibration configuration names
    'BuildConfig',
    'QuantConfig',
    'QuantAlgo',
    'CalibConfig',
    'BuildCacheConfig',

    # Remaining exports, in their existing order
    'RequestError',
    'MpiCommSession',
    'ExtendedRuntimePerfKnobConfig',
    'BatchingType',
    'ContextChunkingPolicy',
    'DynamicBatchConfig',
    'CacheTransceiverConfig',
    'NGramDecodingConfig',
]
|