Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

[TRTLLM-5208][BREAKING CHANGE] chore: make pytorch LLM the default (#5312)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

Parent: 113f6fbadd
Commit: 9bd42ecf9b
@@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You

## Quick Start

-Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model.
+Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model.

```{literalinclude} ../../examples/pytorch/quickstart.py
:language: python
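The quick start's `quickstart.py` is referenced but not reproduced in this diff. For orientation, a minimal sketch of a quick-start script under the new default import path — the model path, prompts, and sampling options are illustrative assumptions, not the contents of the real file:

```python
# Minimal sketch only: quickstart.py itself is not part of this diff, and
# the model path, prompts, and sampling options below are placeholders.
from tensorrt_llm import LLM, SamplingParams


def main():
    prompts = ["Hello, my name is"]
    sampling_params = SamplingParams(max_tokens=32)

    # The PyTorch backend is now the default, so no backend flag is needed.
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
    for output in llm.generate(prompts, sampling_params):
        print(output.outputs[0].text)


if __name__ == "__main__":
    main()
```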
@@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM

llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
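For context, `generate` returns a request output whose text can be inspected; a hedged sketch of doing so with the example above (the attribute names are assumptions based on the `RequestOutput`/`CompletionOutput` classes imported elsewhere in this diff):

```python
# Hedged sketch: inspecting the result of the FP8 example above. The
# RequestOutput/CompletionOutput attribute names are assumptions based on
# the classes imported elsewhere in this diff.
from tensorrt_llm import LLM

llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
output = llm.generate("Hello, my name is")
print(output.prompt)           # the input prompt
print(output.outputs[0].text)  # the generated continuation
```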
@@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM

llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
          enable_trtllm_sampler=True)
sampling_params = SamplingParams(
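The hunk ends mid-statement at `SamplingParams(`. A hedged completion of the example, assuming common `SamplingParams` fields (`temperature`, `top_p`, `max_tokens`) are the options being demonstrated:

```python
# Hedged completion of the truncated sampling example above; the concrete
# option values are assumptions, not the documentation's literal text.
from tensorrt_llm import LLM, SamplingParams

llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
          enable_trtllm_sampler=True)
sampling_params = SamplingParams(
    temperature=1.0,   # stochastic sampling
    top_p=0.9,         # nucleus sampling
    max_tokens=32,
)

# Two identical prompts can yield different completions under sampling.
for output in llm.generate(["Hello, my name is", "Hello, my name is"],
                           sampling_params):
    print(output.outputs[0].text)
```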
@@ -186,7 +186,7 @@ __all__ = [
Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM

import modeling_mymodel


def main():
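Only the imports of the out-of-tree example are visible in this hunk. A hedged sketch of how such a script typically continues — the checkpoint path and the body of `main()` are assumptions:

```python
# Hedged sketch: the diff shows only the imports; everything inside main()
# here is an assumption about typical usage, with a placeholder checkpoint.
from tensorrt_llm import LLM

import modeling_mymodel  # importing registers the out-of-tree model


def main():
    # Once registered, the custom model loads like any in-tree model.
    llm = LLM(model="path/to/mymodel_checkpoint")
    output = llm.generate("Hello, my name is")
    print(output.outputs[0].text)


if __name__ == "__main__":
    main()
```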
@@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d

## Top Level API

-The interface for PyTorch backend is `tensorrt._torch.LLM`.
+The interface for PyTorch backend is `tensorrt_llm.LLM`.

```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
llm = LLM(model=<path_to_llama_from_hf>)
```
@@ -5,7 +5,8 @@ import click
import colorama
from transformers import AutoTokenizer, PreTrainedTokenizer

-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams


class LlmConsole(code.InteractiveConsole):

@@ -18,8 +18,9 @@ import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import CppExecutorError, RequestError
-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams

TIMEOUT_KEEP_ALIVE = 5  # seconds.

@@ -7,11 +7,12 @@ from typing import List, Optional, Union
import torch
from simple_config import SimpleConfig

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.models import ModelFactoryRegistry
from tensorrt_llm._torch.auto_deploy.shim import DemoLLM
from tensorrt_llm._torch.auto_deploy.utils.benchmark import benchmark, store_benchmark_results
from tensorrt_llm._torch.auto_deploy.utils.logger import ad_logger
-from tensorrt_llm.llmapi.llm import LLM, RequestOutput
+from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig
from tensorrt_llm.sampling_params import SamplingParams

@@ -1,5 +1,6 @@
### Automatic Parallelism with LLM
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():

@@ -1,7 +1,7 @@
### Generate Text Using Eagle2 Decoding

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
                                 SamplingParams)

@@ -1,8 +1,8 @@
### Generate Text Using Eagle Decoding

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
-                                 SamplingParams)
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig


def main():

@@ -1,5 +1,6 @@
### Generate text with guided decoding
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import GuidedDecodingParams

@@ -1,7 +1,8 @@
### Generate text
import tempfile

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():

@@ -1,7 +1,8 @@
### Generate Text Asynchronously
import asyncio

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():

@@ -1,7 +1,8 @@
### Generate Text in Streaming
import asyncio

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():

@@ -1,7 +1,8 @@
### Generate text with customization
import tempfile

-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams


def main():

@@ -1,5 +1,6 @@
### Distributed LLM Generation
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():

@@ -1,6 +1,7 @@
### Get KV Cache Events

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import KvCacheConfig

@@ -3,7 +3,7 @@ from typing import List, Optional

import torch

-from tensorrt_llm import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
                                          LogitsProcessor, SamplingParams)

@@ -1,6 +1,6 @@
### Generate Text Using Lookahead Decoding
-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
                                 LookaheadDecodingConfig, SamplingParams)

@@ -2,8 +2,8 @@
import argparse
from pathlib import Path

-from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
                                 MedusaDecodingConfig, SamplingParams)

@@ -1,8 +1,9 @@
### Generate text with multiple LoRA adapters
from huggingface_hub import snapshot_download

-from tensorrt_llm import LLM, BuildConfig
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import LoRARequest
+from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.lora_manager import LoraConfig

@@ -3,7 +3,8 @@ import logging

import torch

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import CalibConfig, QuantAlgo, QuantConfig

major, minor = torch.cuda.get_device_capability()

@@ -1,4 +1,5 @@
-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM


def main():

@@ -32,12 +32,12 @@ from packaging.version import parse
from tqdm import tqdm

import tensorrt_llm
-from tensorrt_llm._torch import LLM as TORCH_LLM
+from tensorrt_llm import LLM as TORCH_LLM
+from tensorrt_llm._tensorrt_engine import LLM as TRT_LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.bindings.executor import DecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig
from tensorrt_llm.llmapi import RequestOutput, SamplingParams
-from tensorrt_llm.llmapi.llm import LLM as TRT_LLM

logger = logging.getLogger(__name__)

@@ -1,6 +1,6 @@
import modeling_opt  # noqa

-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM


def main():

@@ -1,5 +1,4 @@
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams


def main():

@@ -1,7 +1,6 @@
import argparse

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig,
                                 KvCacheConfig, MTPDecodingConfig,
                                 NGramDecodingConfig, TorchCompileConfig)

@@ -6,8 +6,7 @@ from difflib import SequenceMatcher

import torch

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig

@@ -46,6 +46,7 @@ from .builder import BuildConfig, Builder, BuilderConfig, build
from .disaggregated_params import DisaggregatedParams
from .functional import Tensor, constant
from .llmapi import LLM, LlmArgs
+from .llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs
from .logger import logger
from .mapping import Mapping
from .models.automodel import AutoConfig, AutoModelForCausalLM

@@ -98,6 +99,8 @@ __all__ = [
    'tools',
    'LLM',
    'LlmArgs',
+    'TorchLlmArgs',
+    'TrtLlmArgs',
    'SamplingParams',
    'DisaggregatedParams',
    'KvCacheConfig',
tensorrt_llm/_tensorrt_engine/__init__.py (new file, +3)
@@ -0,0 +1,3 @@
+from tensorrt_llm.llmapi.llm import _TrtLLM as LLM
+
+__all__ = ['LLM']
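With this new module in place, backend selection becomes a matter of import path rather than a constructor flag. A hedged illustration (the model path is a placeholder):

```python
# Hedged illustration of backend selection after this commit; the model
# path is a placeholder.

# PyTorch backend -- the new default:
from tensorrt_llm import LLM as TorchLLM

# TensorRT-engine backend -- previously the default, now opt-in:
from tensorrt_llm._tensorrt_engine import LLM as TrtLLM

llm = TorchLLM(model="path/to/hf_model")
# ...or, to keep the old behavior:
# llm = TrtLLM(model="path/to/hf_model")
```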
@@ -1,3 +1,4 @@
from .llm import LLM
+from .model_config import MoeLoadBalancerConfig

-__all__ = ["LLM"]
+__all__ = ["LLM", "MoeLoadBalancerConfig"]

@@ -10,11 +10,12 @@ import torch
import torch.multiprocessing as mp
from transformers import PreTrainedTokenizerBase

+from ...._tensorrt_engine import LLM
from ....executor import GenerationExecutor
from ....executor.request import GenerationRequest
from ....executor.result import CompletionOutput, GenerationResult
from ....inputs.registry import create_input_processor
-from ....llmapi.llm import LLM, RequestOutput
+from ....llmapi.llm import RequestOutput
from ....llmapi.llm_args import _AutoDeployLlmArgs
from ....llmapi.tokenizer import TokenizerBase
from ....sampling_params import SamplingParams
@@ -1,3 +1,13 @@
-from tensorrt_llm.llmapi.llm import _TorchLLM as LLM
+from tensorrt_llm.llmapi.llm import _TorchLLM
+
+
+class LLM(_TorchLLM):
+
+    def __init__(self, *args, **kwargs):
+        raise ImportError(
+            "_torch.llm is deprecated, please use `from tensorrt_llm import LLM` directly"
+        )
+
+
+# Keep the LLM class to guide the users to use the default LLM class
__all__ = ['LLM']
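The shim keeps the old module importable but makes constructing its `LLM` fail with a pointer to the new location. A hedged sketch of the migration it enforces:

```python
# Hedged sketch of the migration enforced by the shim above.

# Old path: the import itself still succeeds, but instantiating the class
# now raises ImportError("_torch.llm is deprecated, ...").
# from tensorrt_llm._torch.llm import LLM

# New path after this commit:
from tensorrt_llm import LLM

llm = LLM(model="path/to/hf_model")  # placeholder model path
```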
@@ -10,13 +10,14 @@ import yaml
from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup,
                                optgroup)

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
-from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
+from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode

# isort: off

@@ -17,7 +17,8 @@ from tensorrt_llm.bench.build.build import get_model_config
from tensorrt_llm.bench.benchmark.utils.general import (
    get_settings_from_engine, get_settings)
# isort: on
-from tensorrt_llm._torch.llm import LLM as PyTorchLLM
+from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment

@@ -25,7 +26,7 @@ from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                           initialize_tokenizer,
                                           update_metadata_for_multimodal)
-from tensorrt_llm.llmapi import LLM, CapacitySchedulerPolicy
+from tensorrt_llm.llmapi import CapacitySchedulerPolicy
from tensorrt_llm.logger import logger
from tensorrt_llm.sampling_params import SamplingParams

@@ -9,7 +9,8 @@ from typing import List, Optional, Set, Tuple
from zmq import PUSH
from zmq.asyncio import Context

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bench.dataclasses.general import InferenceRequest
from tensorrt_llm.bench.dataclasses.reporting import PerfItemTuple, StatsKeeper
from tensorrt_llm.executor.postproc_worker import PostprocParams

@@ -9,7 +9,7 @@ from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer
from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS
from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import QuantConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization.mode import QuantAlgo

@@ -18,10 +18,11 @@ import click

import tensorrt_llm.profiler as profiler

-from .._torch.llm import LLM as PyTorchLLM
+from .. import LLM as PyTorchLLM
+from .._tensorrt_engine import LLM
from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended,
                        GPQAMain, JsonModeEval)
-from ..llmapi import LLM, BuildConfig, KvCacheConfig
+from ..llmapi import BuildConfig, KvCacheConfig
from ..llmapi.llm_utils import update_llm_args_with_extra_options
from ..logger import logger, severity_map

@@ -11,10 +11,11 @@ import yaml
from strenum import StrEnum
from torch.cuda import device_count

-from tensorrt_llm._torch.llm import LLM as PyTorchLLM
+from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import mpi_rank
from tensorrt_llm.executor.utils import LlmLauncherEnvs
-from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy,
+from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
                                 DynamicBatchConfig, KvCacheConfig,
                                 SchedulerConfig)
from tensorrt_llm.llmapi.disagg_utils import (CtxGenServerConfig,

@@ -18,8 +18,9 @@ import click
import datasets
import evaluate

-from .._torch import LLM as PyTorchLLM
-from ..llmapi import LLM, RequestOutput
+from .. import LLM as PyTorchLLM
+from .._tensorrt_engine import LLM
+from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator

@@ -19,8 +19,9 @@ import click
import datasets
import numpy as np

-from .._torch import LLM as PyTorchLLM
-from ..llmapi import LLM, RequestOutput
+from .. import LLM as PyTorchLLM
+from .._tensorrt_engine import LLM
+from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .interface import Evaluator

@@ -28,8 +28,9 @@ try:
except ImportError:
    TemplateLM = object

-from .._torch import LLM as PyTorchLLM
-from ..llmapi import LLM, RequestOutput
+from .. import LLM as PyTorchLLM
+from .._tensorrt_engine import LLM
+from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator

@@ -40,8 +40,9 @@ import click
import numpy as np
import pandas as pd

-from .._torch import LLM as PyTorchLLM
-from ..llmapi import LLM, RequestOutput
+from .. import LLM as PyTorchLLM
+from .._tensorrt_engine import LLM
+from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import Evaluator

@@ -2,7 +2,7 @@ from ..disaggregated_params import DisaggregatedParams
from ..executor import CompletionOutput, RequestError
from ..sampling_params import GuidedDecodingParams, SamplingParams
from .build_cache import BuildCacheConfig
-from .llm import LLM, RequestOutput, _TorchLLM, _TrtLLM
+from .llm import LLM, RequestOutput
# yapf: disable
from .llm_args import (BatchingType, CacheTransceiverConfig, CalibConfig,
                       CapacitySchedulerPolicy, ContextChunkingPolicy,

@@ -50,6 +50,4 @@ __all__ = [
    'LlmArgs',
    'TorchLlmArgs',
    'TrtLlmArgs',
-    '_TrtLLM',
-    '_TorchLLM',
]

@@ -97,6 +97,7 @@ TORCH_LLM_DOCSTRING = TORCH_LLMARGS_EXPLICIT_DOCSTRING + """

Attributes:
    tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
+    llm_id (str): The unique ID of the LLM instance.
"""

@@ -883,6 +884,9 @@ class _TorchLLM(BaseLLM):
        # TODO: deprecate backend in LLM kwargs
        kwargs.pop("backend", None)

+        # Validate that users don't pass TrtLlmArgs-specific arguments
+        self._validate_args_for_torch_backend(kwargs)
+
        super().__init__(model,
                         tokenizer,
                         tokenizer_mode,
@@ -895,8 +899,28 @@ class _TorchLLM(BaseLLM):
                         backend='pytorch',
                         **kwargs)

+    def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
+        """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend.
+        """
+        trtllm_fields = set(TrtLlmArgs.model_fields.keys())
+        torchllm_fields = set(TorchLlmArgs.model_fields.keys())
+
+        trtllm_specific_fields = trtllm_fields - torchllm_fields
+
+        # Check if any TrtLlmArgs-specific arguments are passed
+        trtllm_specific_args = []
+        for key in kwargs:
+            if key in trtllm_specific_fields:
+                trtllm_specific_args.append(key)
+
+        if trtllm_specific_args:
+            raise ValueError(
+                f"The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: {trtllm_specific_args}.\n"
+                f"Please use 'from tensorrt_llm._tensorrt_engine import LLM' instead to use the TensorRT backend."
+            )


-class LLM(_TrtLLM):
+class LLM(_TorchLLM):

    def __init__(self,
                 model: Union[str, Path],
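To make the effect of `_validate_args_for_torch_backend` concrete, a hedged sketch of the failure mode it introduces. It assumes `enable_build_cache` is one of the TrtLlmArgs-only fields, consistent with the TRT-side parameter table later in this diff:

```python
# Hedged sketch of the new argument validation. `enable_build_cache` is
# assumed to be TrtLlmArgs-only (it appears in the TRT parameter table in
# this diff but not the PyTorch one).
from tensorrt_llm import LLM

try:
    llm = LLM(model="path/to/hf_model",  # placeholder path
              enable_build_cache=True)   # TensorRT-only argument
except ValueError as err:
    # The default (PyTorch) LLM rejects TRT-only arguments and points the
    # caller at tensorrt_llm._tensorrt_engine.LLM instead.
    print(err)
```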
@@ -915,15 +939,13 @@ class LLM(_TrtLLM):
                         revision, tokenizer_revision, **kwargs)


-_LLM_REPR = "TrtLLM"
+_LLM_REPR = "TorchLLM"

# sphinx will ignore the LLM's docstring if it is not explicitly set
LLM.__doc__ = \
    f"""LLM class is the main class for running a LLM model.

-This class is an alias of {_LLM_REPR}. You can switch between the TensorRT backend
-and the PyTorch backend by setting the TLLM_USE_TRT_ENGINE environment to 1 or 0.
-The default backend is the TensorRT backend.
+This class is an alias of {_LLM_REPR}.

Parameters:
-""" + TRT_LLM_DOCSTRING
+""" + TORCH_LLM_DOCSTRING
@@ -1591,9 +1591,6 @@ class TrtLlmArgs(BaseLlmArgs):
        return self


-LlmArgs = TrtLlmArgs
-
-
class LoadFormat(Enum):
    AUTO = 0
    # Initialize all weights randomly.

@@ -1663,7 +1660,10 @@ class TorchLlmArgs(BaseLlmArgs):
    moe_load_balancer: Optional[Union[object, str]] = Field(
        default=None,
        description="Configuration for MoE load balancing.",
-        json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
+        json_schema_extra={
+            "type":
+            "Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]"
+        })

    attn_backend: str = Field(default='TRTLLM',
                              description="Attention backend to use.")

@@ -2081,6 +2081,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
    return model_format


+LlmArgs = TorchLlmArgs
+
TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs,
                                                                indent=' ' * 4)
TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs,
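Taken together, the two `LlmArgs` hunks above move the public alias from the TensorRT args class to the PyTorch one. A hedged illustration of the resulting behavior:

```python
# Hedged illustration of the LlmArgs alias flip in this commit.
from tensorrt_llm.llmapi.llm_args import LlmArgs, TorchLlmArgs, TrtLlmArgs

# Before: LlmArgs was an alias of TrtLlmArgs.
# After this commit:
assert LlmArgs is TorchLlmArgs
assert LlmArgs is not TrtLlmArgs
```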
@@ -4,8 +4,8 @@ from typing import Callable
import openai
from transformers import AutoTokenizer

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationExecutor
-from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_args import KvCacheConfig
from tensorrt_llm.sampling_params import SamplingParams

@@ -14,12 +14,12 @@ from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, Response, StreamingResponse
from transformers import AutoConfig, AutoProcessor

+from tensorrt_llm._tensorrt_engine import LLM
# yapf: disable
from tensorrt_llm.executor import CppExecutorError
from tensorrt_llm.executor.postproc_worker import PostprocParams
from tensorrt_llm.inputs import prompt_inputs
from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template
-from tensorrt_llm.llmapi import LLM
from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
from tensorrt_llm.llmapi.disagg_utils import MetadataServerConfig, ServerRole
from tensorrt_llm.llmapi.llm import RequestOutput

@@ -23,10 +23,11 @@ import scipy
import yaml

import tensorrt_llm.evaluate
-from tensorrt_llm._torch import LLM as PyTorchLLM
+from tensorrt_llm import LLM as PyTorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.speculative import SpecConfig
from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.llmapi import LLM, SamplingParams
+from tensorrt_llm.llmapi import SamplingParams
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.models.modeling_utils import QuantConfig

@@ -83,7 +83,6 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
        yaml.dump(gen_server_config, f)

    args = LlmArgs.from_kwargs(model=model_name,
-                               backend="pytorch",
                               tensor_parallel_size=tensor_parallel_size)

    trtllm_serve_path = "trtllm-serve"

@@ -14,7 +14,8 @@
# limitations under the License.
import pytest

-from tensorrt_llm.llmapi import LLM, EagleDecodingConfig, KvCacheConfig
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

@@ -14,7 +14,7 @@
# limitations under the License.
import pytest

-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
                                 MTPDecodingConfig, NGramDecodingConfig,

@@ -9,8 +9,7 @@ from defs.conftest import skip_no_hopper
from mpi4py import MPI
from mpi4py.futures import MPIPoolExecutor

-from tensorrt_llm import DisaggregatedParams, SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams
from tensorrt_llm._utils import set_mpi_comm
from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession

@@ -1,7 +1,8 @@
import os
from pathlib import Path

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import QuantAlgo, QuantConfig

prompts = [

@@ -1,7 +1,8 @@
import os
from pathlib import Path

-from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm import SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM

if __name__ == '__main__':
    prompts = [

@@ -3,7 +3,8 @@ import os

import click

-from tensorrt_llm.llmapi import LLM, BuildConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import BuildConfig, SamplingParams


@click.command()

@@ -23,7 +23,7 @@ from defs.common import convert_weights, venv_check_call
from defs.conftest import llm_models_root, unittest_path
from defs.trt_test_alternative import check_call

-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_utils import BuildConfig

@@ -2107,8 +2107,7 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path,
    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

-    from tensorrt_llm import SamplingParams
-    from tensorrt_llm._torch import LLM
+    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm.sampling_params import SamplingParams
    prompts = [
        "Hello, my name is",

@@ -2,9 +2,9 @@ from unittest.mock import MagicMock, patch

import pytest

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy.shim.demollm import DemoLLM
from tensorrt_llm._torch.auto_deploy.transformations.transform import InferenceOptimizer
-from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_args import TorchCompileConfig, _AutoDeployLlmArgs

# ================================

@@ -128,7 +128,8 @@ def test_config_flow(

    # Create instance with appropriate mocking
    with patch.object(api_class, "_try_load_tokenizer", return_value=MagicMock()):
-        instance = api_class(**config_params)
+        with patch.object(api_class, "_build_model", return_value=MagicMock()):
+            instance = api_class(**config_params)

    # Verify args were created correctly
    assert hasattr(instance, "args")

@@ -7,8 +7,7 @@ import pytest
from utils.llm_data import llm_models_root
from utils.util import getSMVersion

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory

@@ -2,7 +2,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import skip_gpu_memory_less_than

-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.sampling_params import SamplingParams

@@ -2,7 +2,7 @@ import unittest

from parameterized import parameterized

-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.sampling_params import SamplingParams

@@ -5,8 +5,7 @@ import pytest
import torch
from utils.llm_data import llm_models_root

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig

@@ -7,8 +7,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import getSMVersion

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory

@@ -4,8 +4,7 @@ import pytest
import torch
from utils.llm_data import llm_models_root

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig

@@ -5,8 +5,7 @@ import unittest
import pytest
import torch

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

@@ -5,8 +5,7 @@ import unittest
import pytest
import torch

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

@@ -5,8 +5,7 @@ import unittest
import pytest
import torch

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

@@ -4,8 +4,7 @@ from pathlib import Path
import pytest
from utils.llm_data import llm_models_root

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig

@@ -5,8 +5,7 @@ import torch
from utils.llm_data import llm_models_root
from utils.util import force_ampere

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig

prompts = ["A B C"]

@@ -5,8 +5,7 @@ import pytest
from utils.llm_data import llm_models_root
from utils.util import similar

-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig

@@ -17,9 +17,10 @@ import yaml
from pydantic import BaseModel

import tensorrt_llm
+from tensorrt_llm import LLM
from tensorrt_llm.executor import GenerationResult
from tensorrt_llm.executor.result import TokenLogprobs
-from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput,
+from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput,
                                 GuidedDecodingParams, QuantConfig,
                                 RequestOutput, SamplingParams)
from tensorrt_llm.llmapi.llm_utils import LlmArgs

@@ -366,8 +367,14 @@ class ClassSnapshot:
        if self.properties.keys() != other.properties.keys():
            diff_keys = set(self.properties.keys()) ^ set(
                other.properties.keys())
+            this_diff_keys = set(self.properties.keys()) - set(
+                other.properties.keys())
+            other_diff_keys = set(other.properties.keys()) - set(
+                self.properties.keys())
            raise AssertionError(
-                f"{qual_name} has different properties: {diff_keys}")
+                f"{qual_name} has different properties: {diff_keys}\n"
+                f"This class has extra properties: {this_diff_keys}\n"
+                f"The reference has extra properties: {other_diff_keys}")

        for name, prop in self.properties.items():
            with StackTrace().push(name):
@@ -1,90 +1,45 @@
methods:
  __init__:
    parameters:
      # Parallelism
      cp_config:
        annotation: Optional[dict]
        default: null
      auto_parallel:
        annotation: bool
        default: false
      auto_parallel_world_size:
        annotation: Optional[int]
        default: null
      embedding_parallel_mode:
        annotation: str
        default: SHARDING_ALONG_VOCAB
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
      # Engine building
      build_config:
        annotation: Optional[tensorrt_llm.builder.BuildConfig]
        default: null
      enable_build_cache:
        annotation: Union[tensorrt_llm.llmapi.build_cache.BuildCacheConfig, bool]
        default: false
      fast_build:
        annotation: bool
        default: false
      # Bindings and mirrored configs
      batching_type:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
        default: null
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
      extended_runtime_perf_knob_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig]
        default: null
      decoding_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.DecodingConfig]
        default: null
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
      # Misc
      backend:
        annotation: Optional[str]
        default: null
      enable_attention_dp:
        annotation: bool
        default: false
      normalize_log_probs:
        annotation: bool
        default: false
      gather_generation_logits:
        annotation: bool
        default: false
      # Parallelism
      gpus_per_node:
        annotation: Optional[int]
        default: null
      moe_cluster_parallel_size:
        annotation: Optional[int]
        default: null
      enable_attention_dp:
        annotation: bool
        default: False
      cp_config:
        annotation: Optional[dict]
        default: null
      # Stats
      iter_stats_max_iterations:
        annotation: Optional[int]
        default: null
      request_stats_max_iterations:
        annotation: Optional[int]
        default: null
      workspace:
        annotation: Optional[str]
      # Bindings and mirrored configs
      peft_cache_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig]
        default: null
      # LoRA
      max_lora_rank:
        annotation: Optional[int]
      scheduler_config:
        annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig
        default: null
      max_loras:
        annotation: int
        default: 4
      max_cpu_loras:
        annotation: int
        default: 4
      allreduce_strategy:
        annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']]
        default: AUTO
      # postproc worker
      cache_transceiver_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig]
        default: null
      batching_type:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BatchingType]
        default: null
      normalize_log_probs:
        annotation: bool
        default: False
      gather_generation_logits:
        annotation: bool
        default: False
      num_postprocess_workers:
        annotation: int
        default: 0

@@ -98,10 +53,73 @@ methods:
      reasoning_parser:
        annotation: Optional[str]
        default: null
      # kwargs
      kwargs:
        annotation: Any
        default: inspect._empty
      garbage_collection_gen0_threshold:
        annotation: int
        default: 20000
      # Misc
      backend:
        annotation: Optional[str]
        default: null
      build_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
        default: null
      use_cuda_graph:
        annotation: bool
        default: False
      cuda_graph_batch_sizes:
        annotation: Optional[List[int]]
        default: null
      cuda_graph_max_batch_size:
        annotation: int
        default: 0
      cuda_graph_padding_enabled:
        annotation: bool
        default: False
      disable_overlap_scheduler:
        annotation: bool
        default: False
      moe_max_num_tokens:
        annotation: Optional[int]
        default: null
      moe_load_balancer:
        annotation: Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None]
        default: null
      attn_backend:
        annotation: str
        default: TRTLLM
      moe_backend:
        annotation: str
        default: CUTLASS
      mixed_sampler:
        annotation: bool
        default: False
      enable_trtllm_sampler:
        annotation: bool
        default: False
      kv_cache_dtype:
        annotation: str
        default: auto
      enable_iter_perf_stats:
        annotation: bool
        default: False
      enable_iter_req_stats:
        annotation: bool
        default: False
      print_iter_log:
        annotation: bool
        default: False
      torch_compile_config:
        annotation: Optional[tensorrt_llm.llmapi.llm_args.TorchCompileConfig]
        default: null
      autotuner_enabled:
        annotation: bool
        default: True
      enable_layerwise_nvtx_marker:
        annotation: bool
        default: False
      enable_min_latency:
        annotation: bool
        default: False
    return_annotation: None
  generate:
    parameters:

@@ -145,19 +163,10 @@ methods:
        annotation: Optional[float]
        default: 2
      return_annotation: tensorrt_llm.executor.result.IterationResult
-  save:
-    parameters:
-      engine_dir:
-        annotation: str
-        default: inspect._empty
-    return_annotation: None
  shutdown:
    parameters: {}
    return_annotation: None
properties:
-  workspace:
-    annotation: pathlib.Path
-    default: inspect._empty
+  llm_id:
+    annotation: str
+    default: inspect._empty

@@ -95,8 +95,8 @@ methods:
        default: null
      # Misc
      load_format:
-        annotation: Literal['auto', 'dummy']
-        default: auto
+        annotation: Union[str, tensorrt_llm.llmapi.llm_args.LoadFormat]
+        default: 0
      enable_tqdm:
        annotation: bool
        default: false

@@ -106,9 +106,10 @@ methods:
      kv_cache_config:
        annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig
        default: null
+      garbage_collection_gen0_threshold:
+        annotation: int
+        default: 20000
      kwargs:
        annotation: Any
        default: inspect._empty
    return_annotation: None
  generate:
    parameters:
@@ -5,8 +5,10 @@ import pytest
from api_stability_core import (ApiStabilityTestHarness, ClassSnapshot,
                                MethodSnapshot)

+from tensorrt_llm import LLM
from tensorrt_llm.bindings import executor as tllme
-from tensorrt_llm.llmapi import (LLM, CalibConfig, CompletionOutput,
+from tensorrt_llm.executor.result import IterationResult
+from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput,
                                 GuidedDecodingParams, QuantConfig,
                                 RequestOutput)
from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,

@@ -130,21 +132,28 @@ class TestLLM(ApiStabilityTestHarness):

    def test_modified_method_with_same_signature(self, mocker):

-        def new_save(self, engine_dir: str) -> None:
+        def new_get_stats_async(self,
+                                timeout: Optional[float] = 2
+                                ) -> IterationResult:
            pass

-        new_save.__doc__ = self.TEST_CLASS.save.__doc__
+        new_get_stats_async.__doc__ = self.TEST_CLASS.get_stats_async.__doc__

-        mocker.patch.object(self.TEST_CLASS, "save", new=new_save)
+        mocker.patch.object(self.TEST_CLASS,
+                            "get_stats_async",
+                            new=new_get_stats_async)
        self.test_signature()
        self.test_docstring()

    def test_modified_method_with_modified_signature(self, mocker):

-        def new_save(self, engine_dir: Optional[str]) -> None:
+        def new_get_stats_async(self,
+                                timeout: Optional[int] = 2) -> IterationResult:
            pass

-        mocker.patch.object(self.TEST_CLASS, "save", new=new_save)
+        mocker.patch.object(self.TEST_CLASS,
+                            "get_stats_async",
+                            new=new_get_stats_async)
        with pytest.raises(AssertionError):
            self.test_signature()
        with pytest.raises(AssertionError):

@@ -9,8 +9,8 @@ import pytest
from utils.util import (skip_gpu_memory_less_than_40gb, skip_num_gpus_less_than,
                        skip_nvlink_inactive)

+from tensorrt_llm import LLM
from tensorrt_llm.llmapi import BuildConfig
-from tensorrt_llm.llmapi.llm import LLM

from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer

@@ -4,7 +4,7 @@ import pytest
from fastapi.testclient import TestClient
from transformers import AutoTokenizer

-from tensorrt_llm._torch.llm import LLM as PyTorchLLM
+from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig
from tensorrt_llm.serve.openai_server import OpenAIServer

@@ -10,8 +10,8 @@ import pytest
from utils.util import (skip_gpu_memory_less_than_40gb, skip_pre_ada,
                        skip_single_gpu)

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import BuildConfig
-from tensorrt_llm.llmapi.llm import LLM
from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig

from ..test_llm import get_model_path

@@ -4,7 +4,9 @@ from typing import Optional

import click

-from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
+from tensorrt_llm import LLM as TorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams


@click.command()

@@ -20,7 +22,6 @@ def main(model_dir: str, tp_size: int, engine_dir: Optional[str], n: int,
         best_of: Optional[int], top_k: int, use_beam_search: bool,
         use_pytorch: bool):
    if use_pytorch:
-        from tensorrt_llm._torch.llm import LLM as TorchLLM
        llm = TorchLLM(
            model_dir,
            tensor_parallel_size=tp_size,

@@ -6,9 +6,10 @@ from typing import Optional

import click

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationResultBase
from tensorrt_llm.executor.postproc_worker import PostprocArgs, PostprocParams
-from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
+from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.utils import print_colored
from tensorrt_llm.serve.openai_protocol import (
    ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,

@@ -10,6 +10,7 @@ import pytest
import torch
import zmq

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import mpi_world_size
from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import (DetokenizedGenerationResultBase,

@@ -17,7 +18,7 @@ from tensorrt_llm.executor import (DetokenizedGenerationResultBase,
                                   GenerationResult, GenerationResultBase,
                                   PostprocWorker)
from tensorrt_llm.executor.ipc import FusedIpcQueue, ZeroMqQueue
-from tensorrt_llm.llmapi import LLM, BuildConfig
+from tensorrt_llm.llmapi import BuildConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.llmapi.utils import AsyncQueue
from tensorrt_llm.sampling_params import SamplingParams

@@ -25,16 +25,19 @@ import torch
import transformers
from utils.util import skip_single_gpu

+from tensorrt_llm import LLM as LLM_torch
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.bindings import executor as tllm
from tensorrt_llm.executor import (GenerationExecutorWorker, LoRARequest,
                                   PromptAdapterRequest, RequestError)
-from tensorrt_llm.llmapi import (LLM, BuildCacheConfig, EagleDecodingConfig,
+from tensorrt_llm.llmapi import (BuildCacheConfig, EagleDecodingConfig,
                                 KvCacheConfig, KvCacheRetentionConfig,
                                 LookaheadDecodingConfig, MedusaDecodingConfig,
                                 RequestOutput)
+from tensorrt_llm.llmapi import TrtLlmArgs as LlmArgs
from tensorrt_llm.llmapi.llm_args import DynamicBatchConfig, SchedulerConfig
-from tensorrt_llm.llmapi.llm_utils import (BuildConfig, LlmArgs, QuantAlgo,
-                                           QuantConfig, _ParallelConfig)
+from tensorrt_llm.llmapi.llm_utils import (BuildConfig, QuantAlgo, QuantConfig,
+                                           _ParallelConfig)
from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
from tensorrt_llm.lora_manager import LoraConfig

@@ -118,7 +121,6 @@ def llm_test_harness(model_dir: str,
        tokenizer = model_dir

    if backend == "pytorch":
-        from tensorrt_llm._torch import LLM as LLM_torch
        llm = LLM_torch(model_dir, tokenizer=tokenizer, **llm_kwargs)
    else:
        llm = LLM(model_dir, tokenizer=tokenizer, **llm_kwargs)

@@ -1596,7 +1598,6 @@ def llm_return_logprobs_test_harness(prompt_logprobs: Optional[int],
    LLM_CLASS = LLM
    llm_args_extra = {}
    if backend in ["pytorch", "autodeploy"]:
-        from tensorrt_llm._torch import LLM as LLM_torch
        LLM_CLASS = LLM_torch
    else:
        llm_args_extra["fast_build"] = True

@@ -1839,7 +1840,6 @@ def llm_get_stats_test_harness(tp_size: int = 1,
        sampling_args_extra["return_context_logits"] = True

    if pytorch_backend:
-        from tensorrt_llm._torch import LLM as LLM_torch
        llm_args_extra.update(
            dict(enable_iter_perf_stats=True,
                 enable_iter_req_stats=enable_iter_req_stats,

@@ -1894,8 +1894,6 @@ def test_llm_get_queued_stats():
    llm_args_extra = {}
    sampling_args_extra = {}

-    from tensorrt_llm._torch import LLM as LLM_torch
-
    llm_args_extra.update(
        dict(enable_iter_perf_stats=True,
             enable_iter_req_stats=enable_iter_req_stats,

@@ -1967,7 +1965,6 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
        sampling_args_extra["return_context_logits"] = True

    if pytorch_backend:
-        from tensorrt_llm._torch import LLM as LLM_torch
        llm_args_extra.update(
            dict(enable_iter_perf_stats=True,
                 enable_iter_req_stats=enable_iter_req_stats,

@@ -5,8 +5,8 @@ import pytest
import yaml

import tensorrt_llm.bindings.executor as tle
-from tensorrt_llm._torch.llm import LLM as TorchLLM
-from tensorrt_llm.llmapi.llm import LLM
+from tensorrt_llm import LLM as TorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.llm_args import *
from tensorrt_llm.llmapi.utils import print_traceback_on_error

@@ -54,10 +54,10 @@ speculative_config:
    f.seek(0)
    dict_content = yaml.safe_load(f)

-    llm_args = LlmArgs(model=llama_model_path)
+    llm_args = TrtLlmArgs(model=llama_model_path)
    llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(),
                                                    dict_content)
-    llm_args = LlmArgs(**llm_args_dict)
+    llm_args = TrtLlmArgs(**llm_args_dict)
    assert llm_args.speculative_config.max_window_size == 4
    assert llm_args.speculative_config.max_ngram_size == 3
    assert llm_args.speculative_config.max_verification_set_size == 4

@@ -226,10 +226,10 @@ class TestTrtLlmArgs:

    def test_dynamic_setattr(self):
        with pytest.raises(pydantic_core._pydantic_core.ValidationError):
-            args = LlmArgs(model=llama_model_path, invalid_arg=1)
+            args = TrtLlmArgs(model=llama_model_path, invalid_arg=1)

        with pytest.raises(ValueError):
-            args = LlmArgs(model=llama_model_path)
+            args = TrtLlmArgs(model=llama_model_path)
            args.invalid_arg = 1

@@ -1,4 +1,4 @@
-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi.utils import (download_hf_model,
                                       download_hf_pretrained_config)

@@ -2,10 +2,11 @@ import asyncio
import time

import tensorrt_llm
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
from tensorrt_llm._utils import KVCacheEventSerializer
-from tensorrt_llm.llmapi import LLM, KvCacheConfig
+from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.sampling_params import SamplingParams

@@ -8,8 +8,9 @@ from typing import Optional
import pytest
from parameterized import parameterized

+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.executor import GenerationExecutorProxy
-from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
+from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import PretrainedConfig

@@ -1,5 +1,6 @@
import pytest

+from tensorrt_llm import LLM
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.sampling_params import SamplingParams

@@ -71,9 +72,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
    SamplingParams()  # pytorch only supports n=1
])
def test_llm_abort_request(sampling_params):
-    from tensorrt_llm._torch import LLM as LLM_torch
-    llm = LLM_torch(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config)
+    llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
    run_llm_abort_request(llm=llm, sampling_params=sampling_params)

@@ -82,10 +81,9 @@ def test_llm_reward_model():
    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]

-    from tensorrt_llm._torch import LLM as LLM_torch
-    llm = LLM_torch(model=rm_model_path,
-                    attn_backend="VANILLA",
-                    disable_overlap_scheduler=True)
+    llm = LLM(model=rm_model_path,
+              attn_backend="VANILLA",
+              disable_overlap_scheduler=True)

    sampling_params = SamplingParams(return_context_logits=True)

@@ -106,8 +104,6 @@ def test_llm_with_postprocess_parallel_and_result_handler(streaming):


def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
-    from tensorrt_llm._torch.llm import LLM
-
    lora_config = LoraConfig(lora_dir=[
        f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b"
    ],

@@ -134,8 +130,6 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:


def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None:
-    from tensorrt_llm._torch.llm import LLM
-
    hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"
    hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"
    hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0"

@@ -181,8 +175,6 @@ def test_llama_v2_13b_lora():

@skip_gpu_memory_less_than_40gb
def test_llama_7b_lora_default_modules() -> None:
-    from tensorrt_llm._torch.llm import LLM
-
    lora_config = LoraConfig(max_lora_rank=64)

    hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"

@@ -214,8 +206,6 @@ def test_llama_7b_multi_lora():
# https://jirasw.nvidia.com/browse/TRTLLM-5045
@skip_gpu_memory_less_than_138gb
def test_nemotron_nas_lora() -> None:
-    from tensorrt_llm._torch.llm import LLM
-
    lora_config = LoraConfig(lora_dir=[
        f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
    ],

@@ -248,8 +238,6 @@ def test_nemotron_nas_lora() -> None:

@skip_gpu_memory_less_than_80gb
def test_codellama_fp8_with_bf16_lora() -> None:
-    from tensorrt_llm._torch.llm import LLM
-
    model_dir = f"{llm_models_root()}/codellama/CodeLlama-7b-Instruct-hf/"
    quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
                               kv_cache_quant_algo=QuantAlgo.FP8)

@@ -308,8 +296,6 @@ def test_codellama_fp8_with_bf16_lora() -> None:

@skip_gpu_memory_less_than_80gb
def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
-    from tensorrt_llm._torch.llm import LLM
-
    model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"

    target_modules = ['attn_q', 'attn_k', 'attn_v']

@@ -1,6 +1,7 @@
import pytest

-from tensorrt_llm.llmapi import LLM, KvCacheConfig, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
from tensorrt_llm.llmapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig

# isort: off

@@ -37,9 +37,10 @@ def test_ModelLoader():

def test_CachedModelLoader():
    # CachedModelLoader enables engine caching and multi-gpu building
-    args = LlmArgs(model=llama_model_path,
-                   kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4),
-                   enable_build_cache=True)
+    args = TrtLlmArgs(
+        model=llama_model_path,
+        kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.4),
+        enable_build_cache=True)
    stats = LlmBuildStats()
    model_loader = CachedModelLoader(args, llm_build_stats=stats)
    engine_dir, _ = model_loader()

@@ -51,9 +52,9 @@ def test_CachedModelLoader():

def test_LlmArgs_default_gpus_per_node():
    # default
-    llm_args = LlmArgs(model=llama_model_path)
+    llm_args = TrtLlmArgs(model=llama_model_path)
    assert llm_args.gpus_per_node == torch.cuda.device_count()

    # set explicitly
-    llm_args = LlmArgs(model=llama_model_path, gpus_per_node=6)
+    llm_args = TrtLlmArgs(model=llama_model_path, gpus_per_node=6)
    assert llm_args.gpus_per_node == 6

@@ -41,10 +41,10 @@ from helpers import (get_input_tensor_by_name, get_output_config_from_request,
from mpi4py.futures import MPICommExecutor
from mpi4py.MPI import COMM_WORLD

+from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch.llm import LLM as PyTorchLLM
+from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import global_mpi_rank, global_mpi_size
-from tensorrt_llm.llmapi import LLM
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict