TensorRT-LLM v0.18.2 release (#3611)

This commit is contained in:
Kaiyu Xie 2025-04-16 14:42:50 +08:00 committed by GitHub
parent 62f3c954b9
commit 5aec7af45f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
54 changed files with 146 additions and 77 deletions

View File

@ -9,7 +9,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.8.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.9.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.18.1-green)](./tensorrt_llm/version.py)
[![version](https://img.shields.io/badge/release-0.18.2-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
[Architecture](./docs/source/architecture/overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

View File

@ -1,2 +1,2 @@
9f9942768fd5b0cf5ed19860ad539dc9 libtensorrt_llm_ucx_wrapper.so
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e015c5cab637202b76f6ccd2d59b1427dc739f10321996a003230ba32814c08b
oid sha256:ee2a324ae76a843823d1d82686bb495d097367e1c3a41aa9596fd0d2ba3fadae
size 8408224

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1205f8fe60cc657645441c14c304888ad7cd68bc4cd1fabd10931a719560a42a
oid sha256:4fcd95de792c72a38d4c1e76a4e714a2c69ffc25f03172075efb432e40ec29cd
size 8374456

View File

@ -1,2 +1,2 @@
e383212a40dca932c7b77bf4544dab80 libtensorrt_llm_ucx_wrapper.so
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e2fbf80d02c115b9eeb2c18d24e3cb55f0c5404eba563591abeab7d223518df6
oid sha256:6fd8ea50100bbbdc9d1d52d4b7e9a82f01583884eeb4d2703d537b6785c63ea7
size 3102764

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:27f9acc83aa72979834cc8c216cf06e6ab4e9b10a7d1c9928bac3721fef037bd
oid sha256:cd30000142d1256991fa27644d86dcd12a4c017eab9345a88ac705914aba8d11
size 3145744

View File

@ -1,3 +1,3 @@
61ab1a6d4c62ee2a648f6daa5083c4de libtensorrt_llm_executor_static.a
2f2bc67944c45ce0965704da43c9b1c4 libtensorrt_llm_executor_static.pre_cxx11.a
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
1146671822817c690387dc77d775b8c7 libtensorrt_llm_executor_static.a
8f7cb0047a0c2690497a97911a60ed6d libtensorrt_llm_executor_static.pre_cxx11.a
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae603dd0c585a7ee601fb6816ac2cdde674d5c49b96c7dce88de2bc67ea727bc
oid sha256:22951d2bb0e5da2a1eb20ae0eb74690ddd57e7f1dd9545762eed1e0f468dd4a5
size 3457520

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7adc486890442df336e145b0ccf982bc733f1a9cc8116f7ce56f1769cf7b1154
oid sha256:0c13a28fc903da20aad74aeb1c3d04a3b1bf91421fdbe85a16b3552b3e7e431b
size 3448406

View File

@ -1,3 +1,3 @@
e5da8cc2936606dfb49f4417d6961060 libtensorrt_llm_executor_static.a
ad5dfb89c2d719d99d67346828e92e25 libtensorrt_llm_executor_static.pre_cxx11.a
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
34a5173ddebafd3f1621af2717a92f54 libtensorrt_llm_executor_static.a
34eacc123dc995815fbd1e68ec98f78b libtensorrt_llm_executor_static.pre_cxx11.a
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,2 +1,2 @@
f3143205203b038b9dca6dd32cf02f59 libtensorrt_llm_nvrtc_wrapper.so
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,2 +1,2 @@
770ca93818f3f04837a67353e3f71fbc libtensorrt_llm_nvrtc_wrapper.so
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,3 +1,3 @@
6bf0ba4e9b8b1152a21316243d30bec6 libtensorrt_llm_internal_cutlass_kernels_static.a
96f8a359c84a78ba415f4d98ef1c4e1d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b0f621e74dd506e49acd027dc09e9d2a3a6e0117ca0af68254841c02fb9c1dd
size 68126454
oid sha256:447838fe5c798098410a2cfed027aa38df847da2f725b9b8ccec57e73a1e194a
size 68114502

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8448e105a5002148083bacc6c5066e017719ccd532c021f4c821df74fa0b763f
size 68295728
oid sha256:42b88e56cee5b9b81a66836add80ba79819afa24cbfd72140f4d62a244e3f960
size 68295696

View File

@ -1,3 +1,3 @@
0b3322f5047dd4ee549211c2d15483c4 libtensorrt_llm_internal_cutlass_kernels_static.a
502d4901fad6e648b8858051017c4cf2 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
d2efc6043262c896e262e8d8b97055af0f1f8b47 commit
4de75ffa1ff225422ba27f367175448f libtensorrt_llm_internal_cutlass_kernels_static.a
e91d6c762f26c0b158eba8f376914e6e libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
edf502396e4443f284a5fae6044402478cf457c1 commit

View File

@ -5,6 +5,12 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
## TensorRT-LLM Release 0.18.2
### Key Features and Enhancements
- This update addresses known security issues. For the latest NVIDIA Vulnerability Disclosure Information visit https://www.nvidia.com/en-us/security/.
## TensorRT-LLM Release 0.18.1
### Key Features and Enhancements

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -2,7 +2,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,5 +1,5 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
transformers>=4.43.0
datasets==2.14.6
evaluate~=0.4.1

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,3 +1,3 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
nemo-toolkit[all]==2.0.0rc1
megatron-core==0.8.0
datasets~=2.14.5

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets>=2.14.4
nemo-toolkit[all]==2.0.0rc1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
flax>=0.8.2
jax~=0.4.23

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.18.1
tensorrt_llm==0.18.2
tiktoken
datasets
kaldialign

View File

@ -4,6 +4,8 @@ import concurrent.futures
import copy
import datetime
import faulthandler
import hashlib
import hmac
import io
import json
import multiprocessing
@ -21,7 +23,7 @@ from multiprocessing.shared_memory import SharedMemory
from pathlib import Path
from queue import Empty, Queue
from typing import (Any, Callable, Dict, Generator, List, Literal, NamedTuple,
Optional, Tuple, Union)
Optional, Union)
from weakref import WeakMethod
import numpy as np
@ -1220,14 +1222,20 @@ class ExecutorBindingsWorker(GenerationExecutor):
class ZeroMqQueue:
''' A Queue-like container for IPC using ZeroMQ. '''
def __init__(self, address: Optional[str] = None, *, is_server: bool):
def __init__(self,
address: Optional[tuple[str, Optional[bytes]]] = None,
*,
is_server: bool,
use_hmac_encryption: bool = True):
'''
Parameters:
address (Tuple[str, str], optional): The address (tcp-ip_port, authkey) for the IPC. Defaults to None.
address (tuple[str, Optional[bytes]], optional): The address (tcp-ip_port, hmac_auth_key) for the IPC. Defaults to None. If hmac_auth_key is None and use_hmac_encryption is False, the queue will not use HMAC encryption.
is_server (bool): Whether the current process is the server or the client.
use_hmac_encryption (bool): Whether to use HMAC encryption for pickled data. Defaults to True.
'''
self.address = address or "tcp://127.0.0.1:*"
self.address_endpoint = address[
0] if address is not None else "tcp://127.0.0.1:*"
self.is_server = is_server
self.context = zmq.Context()
self.poller = None
@ -1236,11 +1244,35 @@ class ZeroMqQueue:
self._setup_done = False
self.socket = self.context.socket(zmq.PAIR)
# HMAC encryption setup
self.hmac_key = address[1] if address is not None else None
self.use_hmac_encryption = use_hmac_encryption
# Check HMAC key condition
if self.use_hmac_encryption and self.is_server and self.hmac_key is not None:
raise ValueError(
"Server should not receive HMAC key when encryption is enabled")
elif self.use_hmac_encryption and not self.is_server and self.hmac_key is None:
raise ValueError(
"Client must receive HMAC key when encryption is enabled")
elif not self.use_hmac_encryption and self.hmac_key is not None:
raise ValueError(
"Server and client should not receive HMAC key when encryption is disabled"
)
if self.is_server:
self.socket.bind(
self.address
self.address_endpoint
) # Binds to the address and occupy a port immediately
self.address = self.socket.getsockopt(zmq.LAST_ENDPOINT).decode()
self.address_endpoint = self.socket.getsockopt(
zmq.LAST_ENDPOINT).decode()
if self.use_hmac_encryption:
# Initialize HMAC key for pickle encryption
self.hmac_key = os.urandom(32)
self.address = (self.address_endpoint, self.hmac_key)
def setup_lazily(self):
if self._setup_done:
@ -1248,7 +1280,7 @@ class ZeroMqQueue:
self._setup_done = True
if not self.is_server:
self.socket.connect(self.address)
self.socket.connect(self.address_endpoint)
self.poller = zmq.Poller()
self.poller.register(self.socket, zmq.POLLIN)
@ -1276,14 +1308,34 @@ class ZeroMqQueue:
is_final=obj.is_final,
error=obj.error)
message = pickle.dumps(obj) # nosec B301
self.socket.send(message)
if self.use_hmac_encryption:
# Send pickled data with HMAC appended
data = pickle.dumps(obj) # nosec B301
signed_data = self._sign_data(data)
self.socket.send(signed_data)
else:
# Send data without HMAC
self.socket.send_pyobj(obj)
def get(self) -> Any:
self.setup_lazily()
message = self.socket.recv()
obj = pickle.loads(message) # nosec B301
if self.use_hmac_encryption:
# Receive signed data with HMAC
signed_data = self.socket.recv()
# Split data and HMAC
data = signed_data[:-32]
actual_hmac = signed_data[-32:]
# Verify HMAC
if not self._verify_hmac(data, actual_hmac):
raise RuntimeError("HMAC verification failed")
obj = pickle.loads(data) # nosec B301
else:
# Receive data without HMAC
obj = self.socket.recv_pyobj()
if isinstance(obj, GenerationExecutor.Response):
tensors = self._load_tensors_from_shmm(obj.tensors)
@ -1355,6 +1407,17 @@ class ZeroMqQueue:
cum_log_probs=tensors.cum_log_probs,
)
def _verify_hmac(self, data: bytes, actual_hmac: bytes) -> bool:
"""Verify the HMAC of received pickle data."""
expected_hmac = hmac.new(self.hmac_key, data, hashlib.sha256).digest()
return hmac.compare_digest(expected_hmac, actual_hmac)
def _sign_data(self, data_before_encoding: bytes) -> bytes:
"""Generate HMAC for data."""
hmac_signature = hmac.new(self.hmac_key, data_before_encoding,
hashlib.sha256).digest()
return data_before_encoding + hmac_signature
def __del__(self):
self.close()
@ -1366,7 +1429,7 @@ class FusedIpcQueue:
''' A Queue-like container for IPC with optional message batched. '''
def __init__(self,
address: Optional[str] = None,
address: Optional[tuple[str, Optional[bytes]]] = None,
*,
is_server: bool,
fuse_message=False,
@ -1444,7 +1507,7 @@ class FusedIpcQueue:
return obj
@property
def address(self) -> Tuple[str, int, bytes]:
def address(self) -> tuple[str, Optional[bytes]]:
return self.queue.address
def __del__(self):
@ -1515,10 +1578,10 @@ class ExecutorBindingsProxy(GenerationExecutor):
@staticmethod
def workers_main(
engine: Union[Path, Engine],
request_queue_addr: Tuple[str, int, bytes],
request_error_queue_addr: Tuple[str, int, bytes],
result_queue_addr: Tuple[str, int, bytes],
stats_queue_addr: Tuple[str, int, bytes],
request_queue_addr: tuple[str, Optional[bytes]],
request_error_queue_addr: tuple[str, Optional[bytes]],
result_queue_addr: tuple[str, Optional[bytes]],
stats_queue_addr: tuple[str, Optional[bytes]],
executor_config: Optional[tllm.ExecutorConfig] = None,
logits_post_processor_map: Optional[Dict[str, Callable]] = None,
worker_cls: type = ExecutorBindingsWorker,

View File

@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.18.1"
__version__ = "0.18.2"