[TRTLLM-5208][BREAKING CHANGE] chore: make pytorch LLM the default (#5312)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Author: Yan Chunwei, 2025-06-20 03:01:10 +08:00 (committed by GitHub)
Parent: 113f6fbadd
Commit: 9bd42ecf9b
89 changed files with 331 additions and 262 deletions


@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You
## Quick Start
Here is a simple example showing how to use the `tensorrt_llm._torch.LLM` API with a Llama model.
Here is a simple example showing how to use the `tensorrt_llm.LLM` API with a Llama model.
```{literalinclude} ../../examples/pytorch/quickstart.py
:language: python
@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
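For reference, here is a minimal end-to-end sketch (an editor's addition, not part of this diff) that combines the new default import with the FP8 checkpoint named above; the prompt and `max_tokens` value are illustrative choices:

```python
from tensorrt_llm import LLM, SamplingParams

# The PyTorch backend is now the default; no _torch-specific import is needed.
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')

# Generate a short completion and print the decoded text.
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
for output in outputs:
    print(output.outputs[0].text)
```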
@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
To use this feature, enable the `enable_trtllm_sampler` option in the `LLM` class and pass a `SamplingParams` object with the desired options. The following example prepares two identical prompts that will give different results due to the sampling parameters chosen:
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True)
sampling_params = SamplingParams(
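# --- Editor's sketch, not part of the diff: the hunk cuts off the
# `SamplingParams(` call above. A plausible completion, assuming common
# SamplingParams fields such as temperature and top_p, with two identical
# prompts sampled independently:
from tensorrt_llm import SamplingParams

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
for output in llm.generate(["Hello, my name is"] * 2, sampling_params):
    print(output.outputs[0].text)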


@ -186,7 +186,7 @@ __all__ = [
Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
import modeling_mymodel
def main():
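    # --- Editor's sketch, not part of the diff: the hunk is truncated above. ---
    # Importing modeling_mymodel registers the out-of-tree model classes, so the
    # checkpoint can be loaded through the default LLM entry point. The path
    # below is hypothetical:
    llm = LLM(model='path/to/mymodel_checkpoint')
    output = llm.generate("Hello, my name is")
    print(output.outputs[0].text)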


@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d
## Top Level API
The interface for the PyTorch backend is `tensorrt_llm._torch.LLM`.
The interface for the PyTorch backend is `tensorrt_llm.LLM`.
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model=<path_to_llama_from_hf>)
```
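Since this commit also adds a `tensorrt_llm._tensorrt_engine` shim (see below), a short sketch (editor's addition) of how the two backends are now selected purely by import path:

```python
# Default: the PyTorch-backend LLM.
from tensorrt_llm import LLM

# Explicit opt-in to the TensorRT-engine backend introduced by this commit.
from tensorrt_llm._tensorrt_engine import LLM as TrtLLM
```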


@ -5,7 +5,8 @@ import click
import colorama
from transformers import AutoTokenizer, PreTrainedTokenizer
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
class LlmConsole(code.InteractiveConsole):


@ -18,8 +18,9 @@ import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import CppExecutorError, RequestError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
TIMEOUT_KEEP_ALIVE = 5 # seconds.


@ -7,11 +7,12 @@ from typing import List, Optional, Union
import torch
from simple_config import SimpleConfig
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
from tensorrt_llm.llmapi.llm import LLM, RequestOutput
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig
from tensorrt_llm.sampling_params import SamplingParams


@ -1,5 +1,6 @@
### Automatic Parallelism with LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,7 @@
### Generate Text Using Eagle2 Decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
SamplingParams)


@ -1,8 +1,8 @@
### Generate Text Using Eagle Decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
SamplingParams)
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
def main():


@ -1,5 +1,6 @@
### Generate text with guided decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import GuidedDecodingParams


@ -1,7 +1,8 @@
### Generate text
import tempfile
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,8 @@
### Generate Text Asynchronously
import asyncio
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,8 @@
### Generate Text in Streaming
import asyncio
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,7 +1,8 @@
### Generate text with customization
import tempfile
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
def main():


@ -1,5 +1,6 @@
### Distributed LLM Generation
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -1,6 +1,7 @@
### Get KV Cache Events
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig


@ -3,7 +3,7 @@ from typing import List, Optional
import torch
from tensorrt_llm import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
LogitsProcessor, SamplingParams)


@ -1,6 +1,6 @@
### Generate Text Using Lookahead Decoding
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
LookaheadDecodingConfig, SamplingParams)


@ -2,8 +2,8 @@
import argparse
from pathlib import Path
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
MedusaDecodingConfig, SamplingParams)


@ -1,8 +1,9 @@
### Generate text with multiple LoRA adapters
from huggingface_hub import snapshot_download
from tensorrt_llm import LLM, BuildConfig
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import LoRARequest
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.lora_manager import LoraConfig


@ -3,7 +3,8 @@ import logging
import torch
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig
major, minor = torch.cuda.get_device_capability()


@ -1,4 +1,5 @@
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
def main():


@ -32,12 +32,12 @@ from packaging.version import parse
from tqdm import tqdm
import tensorrt_llm
from tensorrt_llm._torch import LLM as TORCH_LLM
from tensorrt_llm import LLM as TORCH_LLM
from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.bindings.executor import DecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
from tensorrt_llm.llmapi import RequestOutput, SamplingParams
from tensorrt_llm.llmapi.llm import LLM as TRT_LLM
logger = logging.getLogger(__name__)


@ -1,6 +1,6 @@
import modeling_opt # noqa
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
def main():


@ -1,5 +1,4 @@
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
def main():


@ -1,7 +1,6 @@
import argparse
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig,
KvCacheConfig, MTPDecodingConfig,
NGramDecodingConfig, TorchCompileConfig)


@ -6,8 +6,7 @@ from difflib import SequenceMatcher
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


@ -46,6 +46,7 @@ from .builder import BuildConfig, Builder, BuilderConfig, build
from .disaggregated_params import DisaggregatedParams
from .functional import Tensor, constant
from .llmapi import LLM, LlmArgs
from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs
from .logger import logger
from .mapping import Mapping
from .models.automodel import AutoConfig, AutoModelForCausalLM
@ -98,6 +99,8 @@ __all__ = [
'tools',
'LLM',
'LlmArgs',
'TorchLlmArgs',
'TrtLlmArgs',
'SamplingParams',
'DisaggregatedParams',
'KvCacheConfig',


@ -0,0 +1,3 @@
from tensorrt_llm.llmapi.llm import _TrtLLM as LLM
__all__ = ['LLM']


@ -1,3 +1,4 @@
from .llm import LLM
from .model_config import MoeLoadBalancerConfig
__all__ = ["LLM"]
__all__ = ["LLM", "MoeLoadBalancerConfig"]


@ -10,11 +10,12 @@ import torch
import torch.multiprocessing as mp
from transformers import PreTrainedTokenizerBase
from ...._tensorrt_engine import LLM
from ....executor import GenerationExecutor
from ....executor.request import GenerationRequest
from ....executor.result import CompletionOutput, GenerationResult
from ....inputs.registry import create_input_processor
from ....llmapi.llm import LLM, RequestOutput
from ....llmapi.llm import RequestOutput
from ....llmapi.llm_args import _AutoDeployLlmArgs
from ....llmapi.tokenizer import TokenizerBase
from ....sampling_params import SamplingParams


@ -1,3 +1,13 @@
from tensorrt_llm.llmapi.llm import _TorchLLM as LLM
from tensorrt_llm.llmapi.llm import _TorchLLM
class LLM(_TorchLLM):
def __init__(self, *args, **kwargs):
raise ImportError(
"_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly"
)
# Keep the LLM class to guide the users to use the default LLM class
__all__ = ['LLM']
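The shim above keeps the old symbol importable but fails fast on construction, steering callers to the new default. A small sketch (editor's addition) of the resulting behavior:

```python
# The deprecated path still imports, but instantiation raises immediately.
from tensorrt_llm._torch.llm import LLM as DeprecatedLLM

try:
    DeprecatedLLM(model='any-model')  # argument is irrelevant; __init__ raises
except ImportError as err:
    print(err)  # points to: from tensorrt_llm import LLM
```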


@ -10,13 +10,14 @@ import yaml
from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup,
optgroup)
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
# isort: off


@ -17,7 +17,8 @@ from tensorrt_llm.bench.build.build import get_model_config
from tensorrt_llm.bench.benchmark.utils.general import (
get_settings_from_engine, get_settings)
# isort: on
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
@ -25,7 +26,7 @@ from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
initialize_tokenizer,
update_metadata_for_multimodal)
from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.logger import logger
from tensorrt_llm.sampling_params import SamplingParams


@ -9,7 +9,8 @@ from typing import List, Optional, Set, Tuple
from zmq import PUSH
from zmq.asyncio import Context
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.dataclasses.general import InferenceRequest
from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper
from tensorrt_llm.executor.postproc_worker import PostprocParams


@ -9,7 +9,7 @@ from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer
from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import QuantConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization.mode import QuantAlgo


@ -18,10 +18,11 @@ import click
import tensorrt_llm.profiler as profiler
from .._torch.llm import LLM as PyTorchLLM
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended,
GPQAMain, JsonModeEval)
from ..llmapi import LLM, BuildConfig, KvCacheConfig
from ..llmapi import BuildConfig, KvCacheConfig
from ..llmapi.llm_utils import update_llm_args_with_extra_options
from ..logger import logger, severity_map


@ -11,10 +11,11 @@ import yaml
from strenum import StrEnum
from torch.cuda import device_count
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import mpi_rank
from tensorrt_llm.executor.utils import LlmLauncherEnvs
from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy,
from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
DynamicBatchConfig, KvCacheConfig,
SchedulerConfig)
from tensorrt_llm.llmapi.disagg_utils import (CtxGenServerConfig,


@ -18,8 +18,9 @@ import click
import datasets
import evaluate
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator


@ -19,8 +19,9 @@ import click
import datasets
import numpy as np
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .interface import Evaluator


@ -28,8 +28,9 @@ try:
except ImportError:
TemplateLM = object
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator


@ -40,8 +40,9 @@ import click
import numpy as np
import pandas as pd
from .._torch import LLM as PyTorchLLM
from ..llmapi import LLM, RequestOutput
from .. import LLM as PyTorchLLM
from .._tensorrt_engine import LLM
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator


@ -2,7 +2,7 @@ from ..disaggregated_params import DisaggregatedParams
from ..executor import CompletionOutput, RequestError
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .build_cache import BuildCacheConfig
from .llm import LLM, RequestOutput, _TorchLLM, _TrtLLM
from .llm import LLM, RequestOutput
# yapf: disable
from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig,
CapacitySchedulerPolicy, ContextChunkingPolicy,
@ -50,6 +50,4 @@ __all__ = [
'LlmArgs',
'TorchLlmArgs',
'TrtLlmArgs',
'_TrtLLM',
'_TorchLLM',
]


@ -97,6 +97,7 @@ TORCH_LLM_DOCSTRING = TORCH_LLMARGS_EXPLICIT_DOCSTRING + """
Attributes:
tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
llm_id (str): The unique ID of the LLM instance.
"""
@ -883,6 +884,9 @@ class _TorchLLM(BaseLLM):
# TODO: deprecate backend in LLM kwargs
kwargs.pop("backend", None)
# Validate that users don't pass TrtLlmArgs-specific arguments
self._validate_args_for_torch_backend(kwargs)
super().__init__(model,
tokenizer,
tokenizer_mode,
@ -895,8 +899,28 @@ class _TorchLLM(BaseLLM):
backend='pytorch',
**kwargs)
def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
"""Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend.
"""
trtllm_fields = set(TrtLlmArgs.model_fields.keys())
torchllm_fields = set(TorchLlmArgs.model_fields.keys())
class LLM(_TrtLLM):
trtllm_specific_fields = trtllm_fields - torchllm_fields
# Check if any TrtLlmArgs-specific arguments are passed
trtllm_specific_args = []
for key in kwargs:
if key in trtllm_specific_fields:
trtllm_specific_args.append(key)
if trtllm_specific_args:
raise ValueError(
f"The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: {trtllm_specific_args}.\n"
f"Please use 'from tensorrt_llm._tensorrt_engine import LLM' instead to use the TensorRT backend."
)
class LLM(_TorchLLM):
def __init__(self,
model: Union[str, Path],
@ -915,15 +939,13 @@ class LLM(_TrtLLM):
revision, tokenizer_revision, **kwargs)
_LLM_REPR = "TrtLLM"
_LLM_REPR = "TorchLLM"
# sphinx will ignore the LLM's docstring if it is not explicitly set
LLM.__doc__ = \
f"""LLM class is the main class for running a LLM model.
This class is an alias of {_LLM_REPR}. You can switch between the TensorRT backend
and the PyTorch backend by setting the TLLM_USE_TRT_ENGINE environment to 1 or 0.
The default backend is the TensorRT backend.
This class is an alias of {_LLM_REPR}.
Parameters:
""" + TRT_LLM_DOCSTRING
""" + TORCH_LLM_DOCSTRING


@ -1591,9 +1591,6 @@ class TrtLlmArgs(BaseLlmArgs):
return self
LlmArgs = TrtLlmArgs
class LoadFormat(Enum):
AUTO = 0
# Initialize all weights randomly.
@ -1663,7 +1660,10 @@ class TorchLlmArgs(BaseLlmArgs):
moe_load_balancer: Optional[Union[object, str]] = Field(
default=None,
description="Configuration for MoE load balancing.",
json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
json_schema_extra={
"type":
"Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]"
})
attn_backend: str = Field(default='TRTLLM',
description="Attention backend to use.")
@ -2081,6 +2081,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
return model_format
LlmArgs = TorchLlmArgs
TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs,
indent=' ' * 4)
TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs,
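With `LlmArgs = TorchLlmArgs` above, downstream code that used the generic alias now gets the PyTorch-backend arguments; a quick sketch (editor's addition):

```python
from tensorrt_llm.llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs

assert LlmArgs is TorchLlmArgs  # the generic alias now targets the PyTorch backend
# TrtLlmArgs remains available for code that opts into the TensorRT engine.
```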


@ -4,8 +4,8 @@ from typing import Callable
import openai
from transformers import AutoTokenizer
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationExecutor
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_args import KvCacheConfig
from tensorrt_llm.sampling_params import SamplingParams


@ -14,12 +14,12 @@ from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, Response, StreamingResponse
from transformers import AutoConfig, AutoProcessor
from tensorrt_llm._tensorrt_engine import LLM
# yapf: disable
from tensorrt_llm.executor import CppExecutorError
from tensorrt_llm.executor.postproc_worker import PostprocParams
from tensorrt_llm.inputs import prompt_inputs
from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template
from tensorrt_llm.llmapi import LLM
from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
from tensorrt_llm.llmapi.disagg_utils import MetadataServerConfig, ServerRole
from tensorrt_llm.llmapi.llm import RequestOutput


@ -23,10 +23,11 @@ import scipy
import yaml
import tensorrt_llm.evaluate
from tensorrt_llm._torch import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.speculative import SpecConfig
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM, SamplingParams
from tensorrt_llm.llmapi import SamplingParams
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.models.modeling_utils import QuantConfig


@ -83,7 +83,6 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
yaml.dump(gen_server_config, f)
args = LlmArgs.from_kwargs(model=model_name,
backend="pytorch",
tensor_parallel_size=tensor_parallel_size)
trtllm_serve_path = "trtllm-serve"


@ -14,7 +14,8 @@
# limitations under the License.
import pytest
from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo


@ -14,7 +14,7 @@
# limitations under the License.
import pytest
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
MTPDecodingConfig, NGramDecodingConfig,


@ -9,8 +9,7 @@ from defs.conftest import skip_no_hopper
from mpi4py import MPI
from mpi4py.futures import MPIPoolExecutor
from tensorrt_llm import DisaggregatedParams, SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams
from tensorrt_llm._utils import set_mpi_comm
from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession


@ -1,7 +1,8 @@
import os
from pathlib import Path
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import QuantAlgo, QuantConfig
prompts = [


@ -1,7 +1,8 @@
import os
from pathlib import Path
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
if __name__ == '__main__':
prompts = [


@ -3,7 +3,8 @@ import os
import click
from tensorrt_llm.llmapi import LLM, BuildConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig, SamplingParams
@click.command()


@ -23,7 +23,7 @@ from defs.common import convert_weights, venv_check_call
from defs.conftest import llm_models_root, unittest_path
from defs.trt_test_alternative import check_call
from tensorrt_llm.llmapi import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import BuildConfig


@ -2107,8 +2107,7 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path,
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.sampling_params import SamplingParams
prompts = [
"Hello, my name is",


@ -2,9 +2,9 @@ from unittest.mock import MagicMock, patch
import pytest
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.shim.demollm import DemoLLM
from tensorrt_llm._torch.auto_deploy.transformations.transform import InferenceOptimizer
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig, _AutoDeployLlmArgs
# ================================
@ -128,6 +128,7 @@ def test_config_flow(
# Create instance with appropriate mocking
with patch.object(api_class, "_try_load_tokenizer", return_value=MagicMock()):
with patch.object(api_class, "_build_model", return_value=MagicMock()):
instance = api_class(**config_params)
# Verify args were created correctly


@ -7,8 +7,7 @@ import pytest
from utils.llm_data import llm_models_root
from utils.util import getSMVersion
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory


@ -2,7 +2,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import skip_gpu_memory_less_than
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.sampling_params import SamplingParams


@ -2,7 +2,7 @@ import unittest
from parameterized import parameterized
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.sampling_params import SamplingParams


@ -5,8 +5,7 @@ import pytest
import torch
from utils.llm_data import llm_models_root
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


@ -7,8 +7,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import getSMVersion
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory


@ -4,8 +4,7 @@ import pytest
import torch
from utils.llm_data import llm_models_root
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig


@ -5,8 +5,7 @@ import unittest
import pytest
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@ -5,8 +5,7 @@ import unittest
import pytest
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@ -5,8 +5,7 @@ import unittest
import pytest
import torch
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))


@ -4,8 +4,7 @@ from pathlib import Path
import pytest
from utils.llm_data import llm_models_root
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig


@ -5,8 +5,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import force_ampere
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig
prompts = ["A B C"]


@ -5,8 +5,7 @@ import pytest
from utils.llm_data import llm_models_root
from utils.util import similar
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig


@ -17,9 +17,10 @@ import yaml
from pydantic import BaseModel
import tensorrt_llm
from tensorrt_llm import LLM
from tensorrt_llm.executor import GenerationResult
from tensorrt_llm.executor.result import TokenLogprobs
from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput,
from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput,
GuidedDecodingParams, QuantConfig,
RequestOutput, SamplingParams)
from tensorrt_llm.llmapi.llm_utils import LlmArgs
@ -366,8 +367,14 @@ class ClassSnapshot:
if self.properties.keys() != other.properties.keys():
diff_keys = set(self.properties.keys()) ^ set(
other.properties.keys())
this_diff_keys = set(self.properties.keys()) - set(
other.properties.keys())
other_diff_keys = set(other.properties.keys()) - set(
self.properties.keys())
raise AssertionError(
f"{qual_name} has different properties: {diff_keys}")
f"{qual_name} has different properties: {diff_keys}\n"
f"This class has extra properties: {this_diff_keys}\n"
f"The reference has extra properties: {other_diff_keys}")
for name, prop in self.properties.items():
with StackTrace().push(name):


@ -2,89 +2,44 @@ methods:
__init__:
parameters:
# Parallelism
cp_config:
annotation: Optional[dict]
default: null
auto_parallel:
annotation: bool
default: false
auto_parallel_world_size:
annotation: Optional[int]
default: null
embedding_parallel_mode:
annotation: str
default: SHARDING_ALONG_VOCAB
moe_cluster_parallel_size:
annotation: Optional[int]
default: null
# Engine building
build_config:
annotation: Optional[tensorrt_llm.builder.BuildConfig]
default: null
enable_build_cache:
annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
default: false
fast_build:
annotation: bool
default: false
# Bindings and mirrored configs
batching_type:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
default: null
peft_cache_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
default: null
scheduler_config:
annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
default: null
extended_runtime_perf_knob_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]
default: null
decoding_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
default: null
cache_transceiver_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
default: null
# Misc
backend:
annotation: Optional[str]
default: null
enable_attention_dp:
annotation: bool
default: false
normalize_log_probs:
annotation: bool
default: false
gather_generation_logits:
annotation: bool
default: false
gpus_per_node:
annotation: Optional[int]
default: null
moe_cluster_parallel_size:
annotation: Optional[int]
default: null
enable_attention_dp:
annotation: bool
default: False
cp_config:
annotation: Optional[dict]
default: null
# Stats
iter_stats_max_iterations:
annotation: Optional[int]
default: null
request_stats_max_iterations:
annotation: Optional[int]
default: null
workspace:
annotation: Optional[str]
# Bindings and mirrored configs
peft_cache_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
default: null
# LoRA
max_lora_rank:
annotation: Optional[int]
scheduler_config:
annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
default: null
max_loras:
annotation: int
default: 4
max_cpu_loras:
annotation: int
default: 4
allreduce_strategy:
annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']]
default: AUTO
# postproc worker
cache_transceiver_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
default: null
batching_type:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
default: null
normalize_log_probs:
annotation: bool
default: False
gather_generation_logits:
annotation: bool
default: False
num_postprocess_workers:
annotation: int
default: 0
@ -98,10 +53,73 @@ methods:
reasoning_parser:
annotation: Optional[str]
default: null
# kwargs
kwargs:
annotation: Any
default: inspect._empty
garbage_collection_gen0_threshold:
annotation: int
default: 20000
# Misc
backend:
annotation: Optional[str]
default: null
build_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
default: null
use_cuda_graph:
annotation: bool
default: False
cuda_graph_batch_sizes:
annotation: Optional[List[int]]
default: null
cuda_graph_max_batch_size:
annotation: int
default: 0
cuda_graph_padding_enabled:
annotation: bool
default: False
disable_overlap_scheduler:
annotation: bool
default: False
moe_max_num_tokens:
annotation: Optional[int]
default: null
moe_load_balancer:
annotation: Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None]
default: null
attn_backend:
annotation: str
default: TRTLLM
moe_backend:
annotation: str
default: CUTLASS
mixed_sampler:
annotation: bool
default: False
enable_trtllm_sampler:
annotation: bool
default: False
kv_cache_dtype:
annotation: str
default: auto
enable_iter_perf_stats:
annotation: bool
default: False
enable_iter_req_stats:
annotation: bool
default: False
print_iter_log:
annotation: bool
default: False
torch_compile_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
default: null
autotuner_enabled:
annotation: bool
default: True
enable_layerwise_nvtx_marker:
annotation: bool
default: False
enable_min_latency:
annotation: bool
default: False
return_annotation: None
generate:
parameters:
@ -145,19 +163,10 @@ methods:
annotation: Optional[float]
default: 2
return_annotation: tensorrt_llm.executor.result.IterationResult
save:
parameters:
engine_dir:
annotation: str
default: inspect._empty
return_annotation: None
shutdown:
parameters: {}
return_annotation: None
properties:
workspace:
annotation: pathlib.Path
default: inspect._empty
llm_id:
annotation: str
default: inspect._empty
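To make the new default argument surface concrete, a construction sketch (editor's addition; the argument names are taken from the table above, and the model path is hypothetical):

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

llm = LLM(model='path/to/hf_model',          # hypothetical checkpoint path
          attn_backend='TRTLLM',             # default attention backend
          use_cuda_graph=True,
          kv_cache_dtype='auto',
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8))
```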


@ -95,8 +95,8 @@ methods:
default: null
# Misc
load_format:
annotation: Literal['auto', 'dummy']
default: auto
annotation: Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat]
default: 0
enable_tqdm:
annotation: bool
default: false
@ -106,9 +106,10 @@ methods:
kv_cache_config:
annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig
default: null
garbage_collection_gen0_threshold:
annotation: int
default: 20000
kwargs:
annotation: Any
default: inspect._empty
return_annotation: None
generate:
parameters:


@ -5,8 +5,10 @@ import pytest
from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot,
MethodSnapshot)
from tensorrt_llm import LLM
from tensorrt_llm.bindings import executor as tllme
from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput,
from tensorrt_llm.executor.result import IterationResult
from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput,
GuidedDecodingParams, QuantConfig,
RequestOutput)
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
@ -130,21 +132,28 @@ class TestLLM(ApiStabilityTestHarness):
def test_modified_method_with_same_signature(self, mocker):
def new_save(self, engine_dir: str) -> None:
def new_get_stats_async(self,
timeout: Optional[float] = 2
) -> IterationResult:
pass
new_save.__doc__ = self.TEST_CLASS.save.__doc__
new_get_stats_async.__doc__ = self.TEST_CLASS.get_stats_async.__doc__
mocker.patch.object(self.TEST_CLASS, "save", new=new_save)
mocker.patch.object(self.TEST_CLASS,
"get_stats_async",
new=new_get_stats_async)
self.test_signature()
self.test_docstring()
def test_modified_method_with_modified_signature(self, mocker):
def new_save(self, engine_dir: Optional[str]) -> None:
def new_get_stats_async(self,
timeout: Optional[int] = 2) -> IterationResult:
pass
mocker.patch.object(self.TEST_CLASS, "save", new=new_save)
mocker.patch.object(self.TEST_CLASS,
"get_stats_async",
new=new_get_stats_async)
with pytest.raises(AssertionError):
self.test_signature()
with pytest.raises(AssertionError):


@ -9,8 +9,8 @@ import pytest
from utils.util import (skip_gpu_memory_less_than_40gb, skip_num_gpus_less_than,
skip_nvlink_inactive)
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.llm import LLM
from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer


@ -4,7 +4,7 @@ import pytest
from fastapi.testclient import TestClient
from transformers import AutoTokenizer
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig
from tensorrt_llm.serve.openai_server import OpenAIServer


@ -10,8 +10,8 @@ import pytest
from utils.util import (skip_gpu_memory_less_than_40gb, skip_pre_ada,
skip_single_gpu)
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig
from ..test_llm import get_model_path


@ -4,7 +4,9 @@ from typing import Optional
import click
from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
from tensorrt_llm import LLM as TorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
@click.command()
@ -20,7 +22,6 @@ def main(model_dir: str, tp_size: int, engine_dir: Optional[str], n: int,
best_of: Optional[int], top_k: int, use_beam_search: bool,
use_pytorch: bool):
if use_pytorch:
from tensorrt_llm._torch.llm import LLM as TorchLLM
llm = TorchLLM(
model_dir,
tensor_parallel_size=tp_size,


@ -6,9 +6,10 @@ from typing import Optional
import click
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationResultBase
from tensorrt_llm.executor.postproc_worker import PostprocArgs, PostprocParams
from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.utils import print_colored
from tensorrt_llm.serve.openai_protocol import (
ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,


@ -10,6 +10,7 @@ import pytest
import torch
import zmq
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import mpi_world_size
from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import (DetokenizedGenerationResultBase,
@ -17,7 +18,7 @@ from tensorrt_llm.executor import (DetokenizedGenerationResultBase,
GenerationResult, GenerationResultBase,
PostprocWorker)
from tensorrt_llm.executor.ipc import FusedIpcQueue, ZeroMqQueue
from tensorrt_llm.llmapi import LLM, BuildConfig
from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.llmapi.utils import AsyncQueue
from tensorrt_llm.sampling_params import SamplingParams


@ -25,16 +25,19 @@ import torch
import transformers
from utils.util import skip_single_gpu
from tensorrt_llm import LLM as LLM_torch
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest,
PromptAdapterRequest, RequestError)
from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, EagleDecodingConfig,
from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
KvCacheConfig, KvCacheRetentionConfig,
LookaheadDecodingConfig, MedusaDecodingConfig,
RequestOutput)
from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
from tensorrt_llm.llmapi.llm_args import DynamicBatchConfig, SchedulerConfig
from tensorrt_llm.llmapi.llm_utils import (BuildConfig, LlmArgs, QuantAlgo,
QuantConfig, _ParallelConfig)
from tensorrt_llm.llmapi.llm_utils import (BuildConfig, QuantAlgo, QuantConfig,
_ParallelConfig)
from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.lora_manager import LoraConfig
@ -118,7 +121,6 @@ def llm_test_harness(model_dir: str,
tokenizer = model_dir
if backend == "pytorch":
from tensorrt_llm._torch import LLM as LLM_torch
llm = LLM_torch(model_dir, tokenizer=tokenizer, **llm_kwargs)
else:
llm = LLM(model_dir, tokenizer=tokenizer, **llm_kwargs)
@ -1596,7 +1598,6 @@ def llm_return_logprobs_test_harness(prompt_logprobs: Optional[int],
LLM_CLASS = LLM
llm_args_extra = {}
if backend in ["pytorch", "autodeploy"]:
from tensorrt_llm._torch import LLM as LLM_torch
LLM_CLASS = LLM_torch
else:
llm_args_extra["fast_build"] = True
@ -1839,7 +1840,6 @@ def llm_get_stats_test_harness(tp_size: int = 1,
sampling_args_extra["return_context_logits"] = True
if pytorch_backend:
from tensorrt_llm._torch import LLM as LLM_torch
llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,
@ -1894,8 +1894,6 @@ def test_llm_get_queued_stats():
llm_args_extra = {}
sampling_args_extra = {}
from tensorrt_llm._torch import LLM as LLM_torch
llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,
@ -1967,7 +1965,6 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
sampling_args_extra["return_context_logits"] = True
if pytorch_backend:
from tensorrt_llm._torch import LLM as LLM_torch
llm_args_extra.update(
dict(enable_iter_perf_stats=True,
enable_iter_req_stats=enable_iter_req_stats,


@ -5,8 +5,8 @@ import pytest
import yaml
import tensorrt_llm.bindings.executor as tle
from tensorrt_llm._torch.llm import LLM as TorchLLM
from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm import LLM as TorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_args import *
from tensorrt_llm.llmapi.utils import print_traceback_on_error
@ -54,10 +54,10 @@ speculative_config:
f.seek(0)
dict_content = yaml.safe_load(f)
llm_args = LlmArgs(model=llama_model_path)
llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(),
dict_content)
llm_args = LlmArgs(**llm_args_dict)
llm_args = TrtLlmArgs(**llm_args_dict)
assert llm_args.speculative_config.max_window_size == 4
assert llm_args.speculative_config.max_ngram_size == 3
assert llm_args.speculative_config.max_verification_set_size == 4
@ -226,10 +226,10 @@ class TestTrtLlmArgs:
def test_dynamic_setattr(self):
with pytest.raises(pydantic_core._pydantic_core.ValidationError):
args = LlmArgs(model=llama_model_path, invalid_arg=1)
args = TrtLlmArgs(model=llama_model_path, invalid_arg=1)
with pytest.raises(ValueError):
args = LlmArgs(model=llama_model_path)
args = TrtLlmArgs(model=llama_model_path)
args.invalid_arg = 1


@ -1,4 +1,4 @@
from tensorrt_llm.llmapi import LLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.utils import (download_hf_model,
download_hf_pretrained_config)


@ -2,10 +2,11 @@ import asyncio
import time
import tensorrt_llm
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm._utils import KVCacheEventSerializer
from tensorrt_llm.llmapi import LLM, KvCacheConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.sampling_params import SamplingParams


@ -8,8 +8,9 @@ from typing import Optional
import pytest
from parameterized import parameterized
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationExecutorProxy
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import PretrainedConfig


@ -1,5 +1,6 @@
import pytest
from tensorrt_llm import LLM
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.sampling_params import SamplingParams
@ -71,9 +72,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
SamplingParams() # pytorch only supports n=1
])
def test_llm_abort_request(sampling_params):
from tensorrt_llm._torch import LLM as LLM_torch
llm = LLM_torch(model=llama_model_path,
kv_cache_config=global_kvcache_config)
llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@ -82,8 +81,7 @@ def test_llm_reward_model():
tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
from tensorrt_llm._torch import LLM as LLM_torch
llm = LLM_torch(model=rm_model_path,
llm = LLM(model=rm_model_path,
attn_backend="VANILLA",
disable_overlap_scheduler=True)
@ -106,8 +104,6 @@ def test_llm_with_postprocess_parallel_and_result_handler(streaming):
def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
from tensorrt_llm._torch.llm import LLM
lora_config = LoraConfig(lora_dir=[
f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b"
],
@ -134,8 +130,6 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None:
from tensorrt_llm._torch.llm import LLM
hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"
hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"
hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0"
@ -181,8 +175,6 @@ def test_llama_v2_13b_lora():
@skip_gpu_memory_less_than_40gb
def test_llama_7b_lora_default_modules() -> None:
from tensorrt_llm._torch.llm import LLM
lora_config = LoraConfig(max_lora_rank=64)
hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"
@ -214,8 +206,6 @@ def test_llama_7b_multi_lora():
# https://jirasw.nvidia.com/browse/TRTLLM-5045
@skip_gpu_memory_less_than_138gb
def test_nemotron_nas_lora() -> None:
from tensorrt_llm._torch.llm import LLM
lora_config = LoraConfig(lora_dir=[
f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
],
@ -248,8 +238,6 @@ def test_nemotron_nas_lora() -> None:
@skip_gpu_memory_less_than_80gb
def test_codellama_fp8_with_bf16_lora() -> None:
from tensorrt_llm._torch.llm import LLM
model_dir = f"{llm_models_root()}/codellama/CodeLlama-7b-Instruct-hf/"
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
kv_cache_quant_algo=QuantAlgo.FP8)
@ -308,8 +296,6 @@ def test_codellama_fp8_with_bf16_lora() -> None:
@skip_gpu_memory_less_than_80gb
def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
from tensorrt_llm._torch.llm import LLM
model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
target_modules = ['attn_q', 'attn_k', 'attn_v']


@ -1,6 +1,7 @@
import pytest
from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig
# isort: off


@ -37,7 +37,8 @@ def test_ModelLoader():
def test_CachedModelLoader():
# CachedModelLoader enables engine caching and multi-gpu building
args = LlmArgs(model=llama_model_path,
args = TrtLlmArgs(
model=llama_model_path,
kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4),
enable_build_cache=True)
stats = LlmBuildStats()
@ -51,9 +52,9 @@ def test_CachedModelLoader():
def test_LlmArgs_default_gpus_per_node():
# default
llm_args = LlmArgs(model=llama_model_path)
llm_args = TrtLlmArgs(model=llama_model_path)
assert llm_args.gpus_per_node == torch.cuda.device_count()
# set explicitly
llm_args = LlmArgs(model=llama_model_path, gpus_per_node=6)
llm_args = TrtLlmArgs(model=llama_model_path, gpus_per_node=6)
assert llm_args.gpus_per_node == 6


@ -41,10 +41,10 @@ from helpers import (get_input_tensor_by_name, get_output_config_from_request,
from mpi4py.futures import MPICommExecutor
from mpi4py.MPI import COMM_WORLD
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch.llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import global_mpi_rank, global_mpi_size
from tensorrt_llm.llmapi import LLM
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict