[TRTLLM-5208][BREAKING CHANGE] chore: make pytorch LLM the default (#5312)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Author: Yan Chunwei, 2025-06-20 03:01:10 +08:00 (committed by GitHub)
Parent: 113f6fbadd
Commit: 9bd42ecf9b
89 changed files with 331 additions and 262 deletions


@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You
## Quick Start
Here is a simple example showing how to use the `tensorrt_llm._torch.LLM` API with a Llama model.
Here is a simple example showing how to use the `tensorrt_llm.LLM` API with a Llama model.
```{literalinclude} ../../examples/pytorch/quickstart.py
:language: python
@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
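For reference, here is a minimal end-to-end sketch (an editor's addition, not part of this diff) that combines the new default import with the FP8 checkpoint named above; the prompt and `max_tokens` value are illustrative choices:

```python
from tensorrt_llm import LLM, SamplingParams

# The PyTorch backend is now the default; no _torch-specific import is needed.
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')

# Generate a short completion and print the decoded text.
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
for output in outputs:
    print(output.outputs[0].text)
```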
@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
To use this feature, enable the `enable_trtllm_sampler` option in the `LLM` class and pass a `SamplingParams` object with the desired options. The following example prepares two identical prompts that will give different results due to the sampling parameters chosen:
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True)
sampling_params = SamplingParams(
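# --- Editor's sketch, not part of the diff: the hunk cuts off the
# `SamplingParams(` call above. A plausible completion, assuming common
# SamplingParams fields such as temperature and top_p, with two identical
# prompts sampled independently:
from tensorrt_llm import SamplingParams

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
for output in llm.generate(["Hello, my name is"] * 2, sampling_params):
    print(output.outputs[0].text)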


@ -186,7 +186,7 @@ __all__ = [
Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
import modeling_mymodel
def main():
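    # --- Editor's sketch, not part of the diff: the hunk is truncated above. ---
    # Importing modeling_mymodel registers the out-of-tree model classes, so the
    # checkpoint can be loaded through the default LLM entry point. The path
    # below is hypothetical:
    llm = LLM(model='path/to/mymodel_checkpoint')
    output = llm.generate("Hello, my name is")
    print(output.outputs[0].text)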


@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d
## Top Level API
The interface for the PyTorch backend is `tensorrt_llm._torch.LLM`.
The interface for the PyTorch backend is `tensorrt_llm.LLM`.
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model=<path_to_llama_from_hf>)
```
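Since this commit also adds a `tensorrt_llm._tensorrt_engine` shim (see below), a short sketch (editor's addition) of how the two backends are now selected purely by import path:

```python
# Default: the PyTorch-backend LLM.
from tensorrt_llm import LLM

# Explicit opt-in to the TensorRT-engine backend introduced by this commit.
from tensorrt_llm._tensorrt_engine import LLM as TrtLLM
```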


@ -5,7 +5,8 @@ import click
import colorama
from transformers import AutoTokenizer, PreTrainedTokenizer
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
class LlmConsole(code.InteractiveConsole):


@ -18,8 +18,9 @@ import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import CppExecutorError, RequestError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
TIMEOUT_KEEP_ALIVE = 5 # seconds.


@ -7,11 +7,12 @@ from typing import List, Optional, Union
import torch
from simple_config import SimpleConfig
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
from tensorrt_llm.llmapi.llm import LLM, RequestOutput
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig
from tensorrt_llm.sampling_params import SamplingParams


@ -1,5 +1,6 @@
### Automatic Parallelism with LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,7 @@
### Generate Text Using Eagle2 Decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
SamplingParams)


@ -1,8 +1,8 @@
### Generate Text Using Eagle Decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
SamplingParams)
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
def main():


@ -1,5 +1,6 @@
### Generate text with guided decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import GuidedDecodingParams


@ -1,7 +1,8 @@
### Generate text
import tempfile
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,8 @@
### Generate Text Asynchronously
import asyncio
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,8 @@
### Generate Text in Streaming
import asyncio
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,8 @@
### Generate text with customization
import tempfile
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
def main():


@ -1,5 +1,6 @@
### Distributed LLM Generation
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,6 +1,7 @@
### Get KV Cache Events
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig


@ -3,7 +3,7 @@ from typing import List, Optional
import torch
from tensorrt_llm import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
LogitsProcessor, SamplingParams)


@ -1,6 +1,6 @@
### Generate Text Using Lookahead Decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
LookaheadDecodingConfig, SamplingParams)


@ -2,8 +2,8 @@
import argparse
from pathlib import Path
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
MedusaDecodingConfig, SamplingParams)


@ -1,8 +1,9 @@
### Generate text with multiple LoRA adapters
from huggingface_hub import snapshot_download
from tensorrt_llm import LLM, BuildConfig
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import LoRARequest
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.lora_manager import LoraConfig


@ -3,7 +3,8 @@ import logging
import torch
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig
major, minor = torch.cuda.get_device_capability()


@ -1,4 +1,5 @@
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -32,12 +32,12 @@ from packaging.version import parse
from tqdm import tqdm
import tensorrt_llm
from tensorrt_llm._torch import LLM as TORCH_LLM
from tensorrt_llm import LLM as TORCH_LLM
from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.bindings.executor import DecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
from tensorrt_llm.llmapi import RequestOutput, SamplingParams
from tensorrt_llm.llmapi.llm import LLM as TRT_LLM
logger = logging.getLogger(__name__)


@ -1,6 +1,6 @@
import modeling_opt # noqa
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
def main():


@ -1,5 +1,4 @@
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
def main():


@ -1,7 +1,6 @@
import argparse
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig,
KvCacheConfig, MTPDecodingConfig,
NGramDecodingConfig, TorchCompileConfig)


@ -6,8 +6,7 @@ from difflib import SequenceMatcher
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


@ -46,6 +46,7 @@ from .builder import BuildConfig, Builder, BuilderConfig, build
from .disaggregated_params import DisaggregatedParams
from .functional import Tensor, constant
from .llmapi import LLM, LlmArgs
from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs
from .logger import logger
from .mapping import Mapping
from .models.automodel import AutoConfig, AutoModelForCausalLM
@ -98,6 +99,8 @@ __all__ = [
'tools',
'LLM',
'LlmArgs',
'TorchLlmArgs',
'TrtLlmArgs',
'SamplingParams',
'DisaggregatedParams',
'KvCacheConfig',


@ -0,0 +1,3 @@
from tensorrt_llm.llmapi.llm import _TrtLLM as LLM
__all__ = ['LLM']


@ -1,3 +1,4 @@
from .llm import LLM
from .model_config import MoeLoadBalancerConfig
__all__ = ["LLM"]
__all__ = ["LLM", "MoeLoadBalancerConfig"]


@ -10,11 +10,12 @@ import torch
import torch.multiprocessing as mp
from transformers import PreTrainedTokenizerBase
from ...._tensorrt_engine import LLM
from ....executor import GenerationExecutor
from ....executor.request import GenerationRequest
from ....executor.result import CompletionOutput, GenerationResult
from ....inputs.registry import create_input_processor
from ....llmapi.llm import LLM, RequestOutput
from ....llmapi.llm import RequestOutput
from ....llmapi.llm_args import _AutoDeployLlmArgs
from ....llmapi.tokenizer import TokenizerBase
from ....sampling_params import SamplingParams


@ -1,3 +1,13 @@
from tensorrt_llm.llmapi.llm import _TorchLLM as LLM
from tensorrt_llm.llmapi.llm import _TorchLLM
class LLM(_TorchLLM):
def __init__(self, *args, **kwargs):
raise ImportError(
"_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly"
)
# Keep the LLM class to guide the users to use the default LLM class
__all__ = ['LLM']
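The shim above keeps the old symbol importable but fails fast on construction, steering callers to the new default. A small sketch (editor's addition) of the resulting behavior:

```python
# The deprecated path still imports, but instantiation raises immediately.
from tensorrt_llm._torch.llm import LLM as DeprecatedLLM

try:
    DeprecatedLLM(model='any-model')  # argument is irrelevant; __init__ raises
except ImportError as err:
    print(err)  # points to: from tensorrt_llm import LLM
```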


@ -10,13 +10,14 @@ import yaml
from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup,
optgroup)
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
# isort: off


@ -17,7 +17,8 @@ from tensorrt_llm.bench.build.build import get_model_config
from tensorrt_llm.bench.benchmark.utils.general import (
get_settings_from_engine, get_settings)
# isort: on
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
@ -25,7 +26,7 @@ from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
initialize_tokenizer,
update_metadata_for_multimodal)
from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.logger import logger
from tensorrt_llm.sampling_params import SamplingParams


@ -9,7 +9,8 @@ from typing import List, Optional, Set, Tuple
from zmq import PUSH
from zmq.asyncio import Context
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.dataclasses.general import InferenceRequest
from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper
from tensorrt_llm.executor.postproc_worker import PostprocParams


@ -9,7 +9,7 @@ from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer
from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import QuantConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization.mode import QuantAlgo


@ -18,10 +18,11 @@ import click
import tensorrt_llm.profiler as profiler
from .._torch.llm import LLM as PyTorchLLM
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended,
GPQAMain, JsonModeEval)
from ..llmapi import LLM, BuildConfig, KvCacheConfig
from ..llmapi import BuildConfig, KvCacheConfig
from ..llmapi.llm_utils import update_llm_args_with_extra_options
from ..logger import logger, severity_map


@ -11,10 +11,11 @@ import yaml
from strenum import StrEnum
from torch.cuda import device_count
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import mpi_rank
from tensorrt_llm.executor.utils import LlmLauncherEnvs
from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy,
from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
DynamicBatchConfig, KvCacheConfig,
SchedulerConfig)
from tensorrt_llm.llmapi.disagg_utils import (CtxGenServerConfig,


@ -18,8 +18,9 @@ import click
import datasets
import evaluate
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator


@ -19,8 +19,9 @@ import click
import datasets
import numpy as np
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .interface import Evaluator


@ -28,8 +28,9 @@ try:
except ImportError:
TemplateLM = object
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator


@ -40,8 +40,9 @@ import click
import numpy as np
import pandas as pd
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator


@ -2,7 +2,7 @@ from ..disaggregated_params import DisaggregatedParams
from ..executor import CompletionOutput, RequestError
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .build_cache import BuildCacheConfig
from .llm import LLM, RequestOutput, _TorchLLM, _TrtLLM
from .llm import LLM, RequestOutput
# yapf: disable
from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig,
CapacitySchedulerPolicy, ContextChunkingPolicy,
@ -50,6 +50,4 @@ __all__ = [
'LlmArgs',
'TorchLlmArgs',
'TrtLlmArgs',
'_TrtLLM',
'_TorchLLM',
]


@ -97,6 +97,7 @@ TORCH_LLM_DOCSTRING = TORCH_LLMARGS_EXPLICIT_DOCSTRING + """
Attributes:
tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
llm_id (str): The unique ID of the LLM instance.
"""
@ -883,6 +884,9 @@ class _TorchLLM(BaseLLM):
# TODO: deprecate backend in LLM kwargs
kwargs.pop("backend", None)
# Validate that users don't pass TrtLlmArgs-specific arguments
self._validate_args_for_torch_backend(kwargs)
super().__init__(model,
tokenizer,
tokenizer_mode,
@ -895,8 +899,28 @@ class _TorchLLM(BaseLLM):
backend='pytorch',
**kwargs)
def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
"""Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend.
"""
trtllm_fields = set(TrtLlmArgs.model_fields.keys())
torchllm_fields = set(TorchLlmArgs.model_fields.keys())
class LLM(_TrtLLM):
trtllm_specific_fields = trtllm_fields - torchllm_fields
# Check if any TrtLlmArgs-specific arguments are passed
trtllm_specific_args = []
for key in kwargs:
if key in trtllm_specific_fields:
trtllm_specific_args.append(key)
if trtllm_specific_args:
raise ValueError(
f"The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: {trtllm_specific_args}.\n"
f"Please use 'from tensorrt_llm._tensorrt_engine import LLM' instead to use the TensorRT backend."
)
class LLM(_TorchLLM):
def __init__(self,
model: Union[str, Path],
@ -915,15 +939,13 @@ class LLM(_TrtLLM):
revision, tokenizer_revision, **kwargs)
_LLM_REPR = "TrtLLM"
_LLM_REPR = "TorchLLM"
# sphinx will ignore the LLM's docstring if it is not explicitly set
LLM.__doc__ = \
f"""LLM class is the main class for running a LLM model.
This class is an alias of {_LLM_REPR}. You can switch between the TensorRT backend
and the PyTorch backend by setting the TLLM_USE_TRT_ENGINE environment to 1 or 0.
The default backend is the TensorRT backend.
This class is an alias of {_LLM_REPR}.
Parameters:
""" + TRT_LLM_DOCSTRING
""" + TORCH_LLM_DOCSTRING


@ -1591,9 +1591,6 @@ class TrtLlmArgs(BaseLlmArgs):
return self
LlmArgs = TrtLlmArgs
class LoadFormat(Enum):
AUTO = 0
# Initialize all weights randomly.
@ -1663,7 +1660,10 @@ class TorchLlmArgs(BaseLlmArgs):
moe_load_balancer: Optional[Union[object, str]] = Field(
default=None,
description="Configuration for MoE load balancing.",
json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
json_schema_extra={
"type":
"Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]"
})
attn_backend: str = Field(default='TRTLLM',
description="Attention backend to use.")
@ -2081,6 +2081,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
return model_format
LlmArgs = TorchLlmArgs
TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs,
indent=' ' * 4)
TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs,
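With `LlmArgs = TorchLlmArgs` above, downstream code that used the generic alias now gets the PyTorch-backend arguments; a quick sketch (editor's addition):

```python
from tensorrt_llm.llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs

assert LlmArgs is TorchLlmArgs  # the generic alias now targets the PyTorch backend
# TrtLlmArgs remains available for code that opts into the TensorRT engine.
```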


@ -4,8 +4,8 @@ from typing import Callable
import openai
from transformers import AutoTokenizer
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationExecutor
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_args import KvCacheConfig
from tensorrt_llm.sampling_params import SamplingParams


@ -14,12 +14,12 @@ from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, Response, StreamingResponse
from transformers import AutoConfig, AutoProcessor
from tensorrt_llm._tensorrt_engine import LLM
# yapf: disable
from tensorrt_llm.executor import CppExecutorError
from tensorrt_llm.executor.postproc_worker import PostprocParams
from tensorrt_llm.inputs import prompt_inputs
from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template
from tensorrt_llm.llmapi import LLM
from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
from tensorrt_llm.llmapi.disagg_utils import MetadataServerConfig, ServerRole
from tensorrt_llm.llmapi.llm import RequestOutput


@ -23,10 +23,11 @@ import scipy
import yaml
import tensorrt_llm.evaluate
from tensorrt_llm._torch import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.speculative import SpecConfig
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM, SamplingParams
from tensorrt_llm.llmapi import SamplingParams
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.models.modeling_utils import QuantConfig


@ -83,7 +83,6 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
yaml.dump(gen_server_config, f)
args = LlmArgs.from_kwargs(model=model_name,
backend="pytorch",
tensor_parallel_size=tensor_parallel_size)
trtllm_serve_path = "trtllm-serve"


@ -14,7 +14,8 @@
# limitations under the License.
import pytest
from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo


@ -14,7 +14,7 @@
# limitations under the License.
import pytest
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
MTPDecodingConfig, NGramDecodingConfig,


@ -9,8 +9,7 @@ from defs.conftest import skip_no_hopper
from mpi4py import MPI
from mpi4py.futures import MPIPoolExecutor
from tensorrt_llm import DisaggregatedParams, SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams
from tensorrt_llm._utils import set_mpi_comm
from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession


@ -1,7 +1,8 @@
import os
from pathlib import Path
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import QuantAlgo, QuantConfig
prompts = [


@ -1,7 +1,8 @@
import os
from pathlib import Path
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
if __name__ == '__main__':
prompts = [


@ -3,7 +3,8 @@ import os
import click
from tensorrt_llm.llmapi import LLM, BuildConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig, SamplingParams
@click.command()


@ -23,7 +23,7 @@ from defs.common import convert_weights, venv_check_call
from defs.conftest import llm_models_root, unittest_path
from defs.trt_test_alternative import check_call
from tensorrt_llm.llmapi import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import BuildConfig


@ -2107,8 +2107,7 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path,
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.sampling_params import SamplingParams
prompts = [
"Hello, my name is",


@ -2,9 +2,9 @@ from unittest.mock import MagicMock, patch
import pytest
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.shim.demollm import DemoLLM
from tensorrt_llm._torch.auto_deploy.transformations.transform import InferenceOptimizer
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig, _AutoDeployLlmArgs
# ================================
@ -128,6 +128,7 @@ def test_config_flow(
# Create instance with appropriate mocking
with patch.object(api_class, "_try_load_tokenizer", return_value=MagicMock()):
with patch.object(api_class, "_build_model", return_value=MagicMock()):
instance = api_class(**config_params)
# Verify args were created correctly


@ -7,8 +7,7 @@ import pytest
from utils.llm_data import llm_models_root
from utils.util import getSMVersion
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory


@ -2,7 +2,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import skip_gpu_memory_less_than
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.sampling_params import SamplingParams


@ -2,7 +2,7 @@ import unittest
from parameterized import parameterized
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.sampling_params import SamplingParams


@ -5,8 +5,7 @@ import pytest
import torch
from utils.llm_data import llm_models_root
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


@ -7,8 +7,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import getSMVersion
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory


@ -4,8 +4,7 @@ import pytest
import torch
from utils.llm_data import llm_models_root
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig


@ -5,8 +5,7 @@ import unittest
import pytest
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@ -5,8 +5,7 @@ import unittest
import pytest
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@ -5,8 +5,7 @@ import unittest
import pytest
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@ -4,8 +4,7 @@ from pathlib import Path
import pytest
from utils.llm_data import llm_models_root
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig


@ -5,8 +5,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import force_ampere
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig
prompts = ["A B C"]


@ -5,8 +5,7 @@ import pytest
from utils.llm_data import llm_models_root
from utils.util import similar
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig


@ -17,9 +17,10 @@ import yaml
from pydantic import BaseModel
import tensorrt_llm
from tensorrt_llm import LLM
from tensorrt_llm.executor import GenerationResult
from tensorrt_llm.executor.result import TokenLogprobs
from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput,
from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput,
GuidedDecodingParams, QuantConfig,
RequestOutput, SamplingParams)
from tensorrt_llm.llmapi.llm_utils import LlmArgs
@ -366,8 +367,14 @@ class ClassSnapshot:
if self.properties.keys() != other.properties.keys():
diff_keys = set(self.properties.keys()) ^ set(
other.properties.keys())
this_diff_keys = set(self.properties.keys()) - set(
other.properties.keys())
other_diff_keys = set(other.properties.keys()) - set(
self.properties.keys())
raise AssertionError(
f"{qual_name} has different properties: {diff_keys}")
f"{qual_name} has different properties: {diff_keys}\n"
f"This class has extra properties: {this_diff_keys}\n"
f"The reference has extra properties: {other_diff_keys}")
for name, prop in self.properties.items():
with StackTrace().push(name):


@ -2,89 +2,44 @@ methods:
__init__:
parameters:
# Parallelism
cp_config:
annotation: Optional[dict]
default: null
auto_parallel:
annotation: bool
default: false
auto_parallel_world_size:
annotation: Optional[int]
default: null
embedding_parallel_mode:
annotation: str
default: SHARDING_ALONG_VOCAB
moe_cluster_parallel_size:
annotation: Optional[int]
default: null
# Engine building
build_config:
annotation: Optional[tensorrt_llm.builder.BuildConfig]
default: null
enable_build_cache:
annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
default: false
fast_build:
annotation: bool
default: false
# Bindings and mirrored configs
batching_type:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
default: null
peft_cache_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
default: null
scheduler_config:
annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
default: null
extended_runtime_perf_knob_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]
default: null
decoding_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
default: null
cache_transceiver_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
default: null
# Misc
backend:
annotation: Optional[str]
default: null
enable_attention_dp:
annotation: bool
default: false
normalize_log_probs:
annotation: bool
default: false
gather_generation_logits:
annotation: bool
default: false
gpus_per_node:
annotation: Optional[int]
default: null
moe_cluster_parallel_size:
annotation: Optional[int]
default: null
enable_attention_dp:
annotation: bool
default: False
cp_config:
annotation: Optional[dict]
default: null
# Stats
iter_stats_max_iterations:
annotation: Optional[int]
default: null
request_stats_max_iterations:
annotation: Optional[int]
default: null
workspace:
annotation: Optional[str]
# Bindings and mirrored configs
peft_cache_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
default: null
# LoRA
max_lora_rank:
annotation: Optional[int]
scheduler_config:
annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
default: null
max_loras:
annotation: int
default: 4
max_cpu_loras:
annotation: int
default: 4
allreduce_strategy:
annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']]
default: AUTO
# postproc worker
cache_transceiver_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
default: null
batching_type:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
default: null
normalize_log_probs:
annotation: bool
default: False
gather_generation_logits:
annotation: bool
default: False
num_postprocess_workers:
annotation: int
default: 0
@ -98,10 +53,73 @@ methods:
reasoning_parser:
annotation: Optional[str]
default: null
# kwargs
kwargs:
annotation: Any
default: inspect._empty
garbage_collection_gen0_threshold:
annotation: int
default: 20000
# Misc
backend:
annotation: Optional[str]
default: null
build_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
default: null
use_cuda_graph:
annotation: bool
default: False
cuda_graph_batch_sizes:
annotation: Optional[List[int]]
default: null
cuda_graph_max_batch_size:
annotation: int
default: 0
cuda_graph_padding_enabled:
annotation: bool
default: False
disable_overlap_scheduler:
annotation: bool
default: False
moe_max_num_tokens:
annotation: Optional[int]
default: null
moe_load_balancer:
annotation: Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None]
default: null
attn_backend:
annotation: str
default: TRTLLM
moe_backend:
annotation: str
default: CUTLASS
mixed_sampler:
annotation: bool
default: False
enable_trtllm_sampler:
annotation: bool
default: False
kv_cache_dtype:
annotation: str
default: auto
enable_iter_perf_stats:
annotation: bool
default: False
enable_iter_req_stats:
annotation: bool
default: False
print_iter_log:
annotation: bool
default: False
torch_compile_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
default: null
autotuner_enabled:
annotation: bool
default: True
enable_layerwise_nvtx_marker:
annotation: bool
default: False
enable_min_latency:
annotation: bool
default: False
return_annotation: None
generate:
parameters:
@ -145,19 +163,10 @@ methods:
annotation: Optional[float]
default: 2
return_annotation: tensorrt_llm.executor.result.IterationResult
save:
parameters:
engine_dir:
annotation: str
default: inspect._empty
return_annotation: None
shutdown:
parameters: {}
return_annotation: None
properties:
workspace:
annotation: pathlib.Path
default: inspect._empty
llm_id:
annotation: str
default: inspect._empty
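To make the new default argument surface concrete, a construction sketch (editor's addition; the argument names are taken from the table above, and the model path is hypothetical):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

llm = LLM(model='path/to/hf_model',          # hypothetical checkpoint path
          attn_backend='TRTLLM',             # default attention backend
          use_cuda_graph=True,
          kv_cache_dtype='auto',
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8))
```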


@ -95,8 +95,8 @@ methods:
default: null
# Misc
load_format:
annotation: Literal['auto', 'dummy']
default: auto
annotation: Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat]
default: 0
enable_tqdm:
annotation: bool
default: false
@ -106,9 +106,10 @@ methods:
kv_cache_config:
annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig
default: null
garbage_collection_gen0_threshold:
annotation: int
default: 20000
kwargs:
annotation: Any
default: inspect._empty
return_annotation: None
generate:
parameters:


@ -5,8 +5,10 @@ import pytest
from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot,
MethodSnapshot)
from tensorrt_llm import LLM
from tensorrt_llm.bindings import executor as tllme
from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput,
from tensorrt_llm.executor.result import IterationResult
from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput,
GuidedDecodingParams, QuantConfig,
RequestOutput)
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
@ -130,21 +132,28 @@ class TestLLM(ApiStabilityTestHarness):
def test_modified_method_with_same_signature(self, mocker):
def new_save(self, engine_dir: str) -> None:
def new_get_stats_async(self,
timeout: Optional[float] = 2
) -> IterationResult:
pass
new_save.__doc__ = self.TEST_CLASS.save.__doc__
new_get_stats_async.__doc__ = self.TEST_CLASS.get_stats_async.__doc__
mocker.patch.object(self.TEST_CLASS, "save", new=new_save)
mocker.patch.object(self.TEST_CLASS,
"get_stats_async",
new=new_get_stats_async)
self.test_signature()
self.test_docstring()
def test_modified_method_with_modified_signature(self, mocker):
def new_save(self, engine_dir: Optional[str]) -> None:
def new_get_stats_async(self,
timeout: Optional[int] = 2) -> IterationResult:
pass
mocker.patch.object(self.TEST_CLASS, "save", new=new_save)
mocker.patch.object(self.TEST_CLASS,
"get_stats_async",
new=new_get_stats_async)
with pytest.raises(AssertionError):
self.test_signature()
with pytest.raises(AssertionError):


@ -9,8 +9,8 @@ import pytest
from utils.util import (skip_gpu_memory_less_than_40gb, skip_num_gpus_less_than,
skip_nvlink_inactive)
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.llm import LLM
from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer


@ -4,7 +4,7 @@ import pytest
from fastapi.testclient import TestClient
from transformers import AutoTokenizer
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig
from tensorrt_llm.serve.openai_server import OpenAIServer


@ -10,8 +10,8 @@ import pytest
from utils.util import (skip_gpu_memory_less_than_40gb, skip_pre_ada,
skip_single_gpu)
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig
from ..test_llm import get_model_path


@ -4,7 +4,9 @@ from typing import Optional
import click
from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
from tensorrt_llm import LLM as TorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
@click.command()
@ -20,7 +22,6 @@ def main(model_dir: str, tp_size: int, engine_dir: Optional[str], n: int,
best_of: Optional[int], top_k: int, use_beam_search: bool,
use_pytorch: bool):
if use_pytorch:
from tensorrt_llm._torch.llm import LLM as TorchLLM
llm = TorchLLM(
model_dir,
tensor_parallel_size=tp_size,


@ -6,9 +6,10 @@ from typing import Optional
import click
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationResultBase
from tensorrt_llm.executor.postproc_worker import PostprocArgs, PostprocParams
from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.utils import print_colored
from tensorrt_llm.serve.openai_protocol import (
ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,


@ -10,6 +10,7 @@ import pytest
import torch
import zmq
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import mpi_world_size
from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import (DetokenizedGenerationResultBase,
@ -17,7 +18,7 @@ from tensorrt_llm.executor import (DetokenizedGenerationResultBase,
GenerationResult, GenerationResultBase,
PostprocWorker)
from tensorrt_llm.executor.ipc import FusedIpcQueue, ZeroMqQueue
from tensorrt_llm.llmapi import LLM, BuildConfig
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.llmapi.utils import AsyncQueue
from tensorrt_llm.sampling_params import SamplingParams


@ -25,16 +25,19 @@ import torch
import transformers
from utils.util import skip_single_gpu
from tensorrt_llm import LLM as LLM_torch
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest,
PromptAdapterRequest, RequestError)
from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, EagleDecodingConfig,
from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
KvCacheConfig, KvCacheRetentionConfig,
LookaheadDecodingConfig, MedusaDecodingConfig,
RequestOutput)
from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
from tensorrt_llm.llmapi.llm_args import DynamicBatchConfig, SchedulerConfig
from tensorrt_llm.llmapi.llm_utils import (BuildConfig, LlmArgs, QuantAlgo,
QuantConfig, _ParallelConfig)
from tensorrt_llm.llmapi.llm_utils import (BuildConfig, QuantAlgo, QuantConfig,
_ParallelConfig)
from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.lora_manager import LoraConfig
@ -118,7 +121,6 @@ def llm_test_harness(model_dir: str,
tokenizer = model_dir
if backend == "pytorch":
from tensorrt_llm._torch import LLM as LLM_torch
llm = LLM_torch(model_dir, tokenizer=tokenizer, **llm_kwargs)
else:
llm = LLM(model_dir, tokenizer=tokenizer, **llm_kwargs)
@ -1596,7 +1598,6 @@ def llm_return_logprobs_test_harness(prompt_logprobs: Optional[int],
LLM_CLASS = LLM
llm_args_extra = {}
if backend in ["pytorch", "autodeploy"]:
from tensorrt_llm._torch import LLM as LLM_torch
LLM_CLASS = LLM_torch
else:
llm_args_extra["fast_build"] = True
@ -1839,7 +1840,6 @@ def llm_get_stats_test_harness(tp_size: int = 1,
sampling_args_extra["return_context_logits"] = True
if pytorch_backend:
from tensorrt_llm._torch import LLM as LLM_torch
llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,
@ -1894,8 +1894,6 @@ def test_llm_get_queued_stats():
llm_args_extra = {}
sampling_args_extra = {}
from tensorrt_llm._torch import LLM as LLM_torch
llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,
@ -1967,7 +1965,6 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
sampling_args_extra["return_context_logits"] = True
if pytorch_backend:
from tensorrt_llm._torch import LLM as LLM_torch
llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,


@ -5,8 +5,8 @@ import pytest
import yaml
import tensorrt_llm.bindings.executor as tle
from tensorrt_llm._torch.llm import LLM as TorchLLM
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm import LLM as TorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_args import *
from tensorrt_llm.llmapi.utils import print_traceback_on_error
@ -54,10 +54,10 @@ speculative_config:
f.seek(0)
dict_content = yaml.safe_load(f)
llm_args = LlmArgs(model=llama_model_path)
llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(),
dict_content)
llm_args = LlmArgs(**llm_args_dict)
llm_args = TrtLlmArgs(**llm_args_dict)
assert llm_args.speculative_config.max_window_size == 4
assert llm_args.speculative_config.max_ngram_size == 3
assert llm_args.speculative_config.max_verification_set_size == 4
@ -226,10 +226,10 @@ class TestTrtLlmArgs:
def test_dynamic_setattr(self):
with pytest.raises(pydantic_core._pydantic_core.ValidationError):
args = LlmArgs(model=llama_model_path, invalid_arg=1)
args = TrtLlmArgs(model=llama_model_path, invalid_arg=1)
with pytest.raises(ValueError):
args = LlmArgs(model=llama_model_path)
args = TrtLlmArgs(model=llama_model_path)
args.invalid_arg = 1


@ -1,4 +1,4 @@
from tensorrt_llm.llmapi import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.utils import (download_hf_model,
download_hf_pretrained_config)


@ -2,10 +2,11 @@ import asyncio
import time
import tensorrt_llm
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm._utils import KVCacheEventSerializer
from tensorrt_llm.llmapi import LLM, KvCacheConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.sampling_params import SamplingParams


@ -8,8 +8,9 @@ from typing import Optional
import pytest
from parameterized import parameterized
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationExecutorProxy
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import PretrainedConfig


@ -1,5 +1,6 @@
import pytest
from tensorrt_llm import LLM
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.sampling_params import SamplingParams
@ -71,9 +72,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
SamplingParams() # pytorch only supports n=1
])
def test_llm_abort_request(sampling_params):
from tensorrt_llm._torch import LLM as LLM_torch
llm = LLM_torch(model=llama_model_path,
kv_cache_config=global_kvcache_config)
llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@ -82,8 +81,7 @@ def test_llm_reward_model():
tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
from tensorrt_llm._torch import LLM as LLM_torch
llm = LLM_torch(model=rm_model_path,
llm = LLM(model=rm_model_path,
attn_backend="VANILLA",
disable_overlap_scheduler=True)
@ -106,8 +104,6 @@ def test_llm_with_postprocess_parallel_and_result_handler(streaming):
def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
from tensorrt_llm._torch.llm import LLM
lora_config = LoraConfig(lora_dir=[
f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b"
],
@ -134,8 +130,6 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None:
from tensorrt_llm._torch.llm import LLM
hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"
hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"
hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0"
@ -181,8 +175,6 @@ def test_llama_v2_13b_lora():
@skip_gpu_memory_less_than_40gb
def test_llama_7b_lora_default_modules() -> None:
from tensorrt_llm._torch.llm import LLM
lora_config = LoraConfig(max_lora_rank=64)
hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"
@ -214,8 +206,6 @@ def test_llama_7b_multi_lora():
# https://jirasw.nvidia.com/browse/TRTLLM-5045
@skip_gpu_memory_less_than_138gb
def test_nemotron_nas_lora() -> None:
from tensorrt_llm._torch.llm import LLM
lora_config = LoraConfig(lora_dir=[
f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
],
@ -248,8 +238,6 @@ def test_nemotron_nas_lora() -> None:
@skip_gpu_memory_less_than_80gb
def test_codellama_fp8_with_bf16_lora() -> None:
from tensorrt_llm._torch.llm import LLM
model_dir = f"{llm_models_root()}/codellama/CodeLlama-7b-Instruct-hf/"
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
kv_cache_quant_algo=QuantAlgo.FP8)
@ -308,8 +296,6 @@ def test_codellama_fp8_with_bf16_lora() -> None:
@skip_gpu_memory_less_than_80gb
def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
from tensorrt_llm._torch.llm import LLM
model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
target_modules = ['attn_q', 'attn_k', 'attn_v']


@ -1,6 +1,7 @@
import pytest
from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig
# isort: off


@ -37,7 +37,8 @@ def test_ModelLoader():
def test_CachedModelLoader():
# CachedModelLoader enables engine caching and multi-gpu building
args = LlmArgs(model=llama_model_path,
args = TrtLlmArgs(
model=llama_model_path,
kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4),
enable_build_cache=True)
stats = LlmBuildStats()
@ -51,9 +52,9 @@ def test_CachedModelLoader():
def test_LlmArgs_default_gpus_per_node():
# default
llm_args = LlmArgs(model=llama_model_path)
llm_args = TrtLlmArgs(model=llama_model_path)
assert llm_args.gpus_per_node == torch.cuda.device_count()
# set explicitly
llm_args = LlmArgs(model=llama_model_path, gpus_per_node=6)
llm_args = TrtLlmArgs(model=llama_model_path, gpus_per_node=6)
assert llm_args.gpus_per_node == 6


@ -41,10 +41,10 @@ from helpers import (get_input_tensor_by_name, get_output_config_from_request,
from mpi4py.futures import MPICommExecutor
from mpi4py.MPI import COMM_WORLD
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import global_mpi_rank, global_mpi_size
from tensorrt_llm.llmapi import LLM
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict