TensorRT-LLMs/tensorrt_llm/llmapi/__init__.py
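"""Public entry points for the ``tensorrt_llm.llmapi`` package.

The names re-exported here (``LLM``, ``SamplingParams``, the various
``*Config`` classes, and so on) form the user-facing LLM API surface.
A minimal usage sketch follows; the model path, prompt, and parameter
values are illustrative placeholders, not values defined by this module:

    from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams

    llm = LLM(
        model="/path/to/checkpoint",  # placeholder model path
        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32, temperature=0.8))
    for out in outputs:
        print(out.outputs[0].text)
"""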

from .._torch.async_llm import AsyncLLM
from ..disaggregated_params import DisaggregatedParams
from ..executor import CompletionOutput, LoRARequest, RequestError
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .build_cache import BuildCacheConfig
from .llm import LLM, RequestOutput
# yapf: disable
from .llm_args import (AttentionDpConfig, AutoDecodingConfig, BatchingType,
                       CacheTransceiverConfig, CalibConfig,
                       CapacitySchedulerPolicy, ContextChunkingPolicy,
                       CudaGraphConfig, DeepSeekSparseAttentionConfig,
                       DraftTargetDecodingConfig, DynamicBatchConfig,
                       EagleDecodingConfig, ExtendedRuntimePerfKnobConfig,
                       KvCacheConfig, LlmArgs, LookaheadDecodingConfig,
                       MedusaDecodingConfig, MoeConfig, MTPDecodingConfig,
                       NGramDecodingConfig, RocketSparseAttentionConfig,
                       SaveHiddenStatesDecodingConfig, SchedulerConfig,
                       SkipSoftmaxAttentionConfig, TorchCompileConfig,
                       TorchLlmArgs, TrtLlmArgs, UserProvidedDecodingConfig)
# yapf: enable
from .llm_utils import (BuildConfig, KvCacheRetentionConfig, QuantAlgo,
                        QuantConfig)
from .mm_encoder import MultimodalEncoder
from .mpi_session import MpiCommSession

__all__ = [
    'LLM',
    'AsyncLLM',
    'MultimodalEncoder',
    'CompletionOutput',
    'RequestOutput',
    'GuidedDecodingParams',
    'SamplingParams',
    'DisaggregatedParams',
    'KvCacheConfig',
    'KvCacheRetentionConfig',
    'CudaGraphConfig',
    'MoeConfig',
    'LookaheadDecodingConfig',
    'MedusaDecodingConfig',
    'EagleDecodingConfig',
    'MTPDecodingConfig',
    'SchedulerConfig',
    'CapacitySchedulerPolicy',
    'BuildConfig',
    'QuantConfig',
    'QuantAlgo',
    'CalibConfig',
    'BuildCacheConfig',
    'RequestError',
    'MpiCommSession',
    'ExtendedRuntimePerfKnobConfig',
    'BatchingType',
    'ContextChunkingPolicy',
    'DynamicBatchConfig',
    'CacheTransceiverConfig',
    'NGramDecodingConfig',
    'UserProvidedDecodingConfig',
    'TorchCompileConfig',
    'DraftTargetDecodingConfig',
    'LlmArgs',
    'TorchLlmArgs',
    'TrtLlmArgs',
    'AutoDecodingConfig',
    'AttentionDpConfig',
    'LoRARequest',
    'SaveHiddenStatesDecodingConfig',
    'RocketSparseAttentionConfig',
    'DeepSeekSparseAttentionConfig',
    'SkipSoftmaxAttentionConfig',
]