TensorRT-LLM v0.18 release (#3231)

Commit: 3c0462002c (parent 258c7540c0)
Author: Kaiyu Xie
Date: 2025-04-02 17:01:16 +08:00, committed via GitHub
113 changed files with 1001 additions and 707 deletions

View File

@ -4,7 +4,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
rev: v1.1.13
rev: v1.5.5
hooks:
- id: remove-crlf
- repo: https://github.com/google/yapf
@ -24,7 +24,7 @@ repos:
- id: check-yaml
- id: trailing-whitespace
- repo: https://github.com/PyCQA/autoflake
rev: v1.6.1
rev: v2.3.1
hooks:
- id: autoflake
args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables']

View File

@ -7,9 +7,9 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.17.0-green)](./tensorrt_llm/version.py)
[![cuda](https://img.shields.io/badge/cuda-12.8.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.9.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.18.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
[Architecture](./docs/source/architecture/overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

View File

@ -94,6 +94,7 @@ private:
[[nodiscard]] SizeType32 getMaxSequenceLen() const;
[[nodiscard]] SizeType32 getMaxNumSequences() const;
[[nodiscard]] SizeType32 getMaxDraftLen() const;
[[nodiscard]] SizeType32 getVocabSizePadded() const;
void validateLlmRequest(
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;

View File

@ -26,8 +26,6 @@
namespace tensorrt_llm::batch_manager
{
namespace tle = tensorrt_llm::executor;
class AllocateKvCache : Algorithm
{
using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

View File

@ -56,7 +56,7 @@ public:
struct ExpiringBlockComparator
{
inline bool operator()(BlockPtr const& a, BlockPtr const& b) const
bool operator()(BlockPtr const& a, BlockPtr const& b) const
{
// If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the
// raw pointer values.
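The comparator above keeps evicted blocks ordered by expiration time and, when two blocks expire in the same millisecond, falls back to pointer identity so the ordering stays strict. A minimal sketch of that pattern, using a hypothetical `Block` type rather than the real kv_cache_manager classes:
```cpp
#include <chrono>
#include <functional>
#include <memory>
#include <set>

// Hypothetical stand-in for the real KV-cache block type.
struct Block
{
    std::chrono::steady_clock::time_point expirationTime;
};
using BlockPtr = std::shared_ptr<Block>;

struct ExpiringBlockComparator
{
    bool operator()(BlockPtr const& a, BlockPtr const& b) const
    {
        // Primary key: earlier expiration sorts first.
        if (a->expirationTime != b->expirationTime)
        {
            return a->expirationTime < b->expirationTime;
        }
        // Tie-break on the raw pointers so distinct blocks never compare
        // equivalent, which keeps std::set from collapsing them into one entry.
        return std::less<Block*>{}(a.get(), b.get());
    }
};

// Blocks ordered cheapest-to-evict first.
using ExpiringBlockSet = std::set<BlockPtr, ExpiringBlockComparator>;
```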

View File

@ -166,6 +166,8 @@ class KVCacheBlock
public:
using IdType = std::int32_t;
static constexpr IdType kCachedBlocksRootId = -1;
explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);
void startScheduling();
@ -379,6 +381,16 @@ public:
return mKvCacheRetentionConfig.getDecodeDurationMs();
}
[[nodiscard]] bool getContextRequiresCyclicKvCache() const
{
return mContextRequiresCyclicKvCache;
}
void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache)
{
mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache;
}
private:
// Request id of the sequence
LlmRequest::RequestIdType mRequestId;
@ -392,6 +404,9 @@ private:
runtime::ITensor::SharedPtr mCacheBlockIndices;
// The retention priority to assign to decode blocks
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
// A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache.
bool mContextRequiresCyclicKvCache{false};
};
// attach metadata to a pool pointer
@ -443,7 +458,7 @@ public:
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);
~BlockManager();
@ -712,6 +727,9 @@ private:
SizeType32 mMissedBlocks;
std::set<KVCacheBlock::IdType> reusedBlockIds;
// Whether or not to maintain a hashmap of blocks.
bool mEnableHashKey;
private:
friend class KVCacheManager;
};
@ -818,16 +836,18 @@ public:
//! \details These blocks become reusable from next step.
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
[[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0;
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
LlmRequest::RequestIdType requestId) const
= 0;
virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
[[nodiscard]] virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
std::vector<LlmRequest::RequestIdType> const& requestIds) const
= 0;
virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
virtual void refreshBlocks() = 0;
virtual void flushIterationEvents() = 0;
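The accessors in this hunk gain `[[nodiscard]]`, so a caller that silently drops the returned value now triggers a compiler diagnostic. A small illustration with a hypothetical trimmed-down interface (not the real class hierarchy):
```cpp
#include <cstdint>
#include <vector>

// Hypothetical trimmed-down interface illustrating the [[nodiscard]] annotations.
class ICacheManager
{
public:
    virtual ~ICacheManager() = default;

    // Ignoring these return values is almost certainly a bug, so the compiler flags it.
    [[nodiscard]] virtual bool schedulingHasFreeBlocks(std::int32_t numRequired = 1) const = 0;
    [[nodiscard]] virtual std::vector<std::int32_t> const& getCacheBlockIds(std::uint64_t requestId) const = 0;
};

// In caller code:
//   manager.schedulingHasFreeBlocks();  // warning: ignoring return value declared [[nodiscard]]
```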
@ -846,7 +866,7 @@ public:
* 2 * modelConfig.getSizePerHead();
}
[[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
[[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
@ -924,7 +944,7 @@ public:
return mBlockManager.getNumFreeBlocks();
}
[[nodiscard]] virtual SizeType32 getNumPools() const override
[[nodiscard]] SizeType32 getNumPools() const override
{
return mBlockManager.getNumPools();
}
@ -994,8 +1014,6 @@ public:
/// @return The number of blocks
[[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override;
void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens);
/// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
void addToken(LlmRequest::RequestIdType requestId) override;

View File

@ -91,7 +91,7 @@ private:
runtime::ITensor::SharedPtr mPool;
runtime::ITensor::SharedPtr mCurrent;
const std::vector<SizeType32> mBlockIds;
std::vector<SizeType32> const mBlockIds;
size_t mIdx;
};

View File

@ -490,9 +490,14 @@ public:
initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
}
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded,
std::optional<SizeType32> maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false)
{
if (mEndId.has_value())
{
TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
"EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
}
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
maxEncoderInputLen.value());
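The added check constrains `endId` to the half-open range [-1, vocabSizePadded): -1 is the "no end token" sentinel, and any id at or beyond the padded vocabulary size could never be produced. A standalone sketch of the same rule, assuming a plain exception in place of `TLLM_CHECK_WITH_INFO`:
```cpp
#include <stdexcept>
#include <string>

// Hypothetical standalone version of the endId range check.
void validateEndId(int endId, int vocabSizePadded)
{
    // -1 disables EOS-based stopping; valid token ids are [0, vocabSizePadded).
    if (endId < -1 || endId >= vocabSizePadded)
    {
        throw std::invalid_argument("EndId (" + std::to_string(endId)
            + ") is not within acceptable range [-1, " + std::to_string(vocabSizePadded) + ").");
    }
}
```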

View File

@ -21,6 +21,7 @@
#include "tensorrt_llm/layers/defaultDecodingParams.h"
#include "tensorrt_llm/runtime/common.h"
#include <algorithm>
#include <functional>
#include <optional>
#include <vector>

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d
size 9125406
oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1
size 9123884

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d5c2aac4ca61d80f8a61d06a7e17fcfb0c9428a3dd89ade705f14cf651acd4b
size 9169292
oid sha256:e7a942b813cd05c5d21c82a7e5b5988227988668bf960b3a954443998b4e2a2b
size 9167324

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c067e858d968710bfe4575694ff5cabf1fb9d29924f6bd4cee552c4fd485a8ca
size 2026192
oid sha256:a11179efe519b2b001d6021c7cbea949c81b3618395fa2ce44da9b09d7d35d14
size 2029704

View File

@ -1,2 +1,2 @@
ca50ae76421863dfebf6080b7f4f6b29 libtensorrt_llm_ucx_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
9f9942768fd5b0cf5ed19860ad539dc9 libtensorrt_llm_ucx_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:092faef60d09d008331b2b09453d89563b5fe6c49a6a195051ad51ca80f79277
size 8407972
oid sha256:e2ec997b71d8b990eecc0034930b24319916ed3615a618f982e1b780d7643bc6
size 8408224

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d00bce3aa2f555f98cb2213d183a3174bb083cd62179ac65dce24d75bd648eb
size 8374854
oid sha256:2662dfb4833b41f71f4ef9cfe6a46d6312a04065c2343a899e1476102019a180
size 8374456

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ac4ad59907a322e6fbb85b9e88cea587cc4f5e4018017726895f64bd800c8552
oid sha256:1a1d23d99e2b91fa11ee3e1fb94179c0731065059eb981da9434a42d98ffa4d8
size 15592

View File

@ -1,2 +1,2 @@
0ec83a0451530fcf0e3f325cdc185043 libtensorrt_llm_ucx_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
e383212a40dca932c7b77bf4544dab80 libtensorrt_llm_ucx_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a64b3c16150b34ad6437862eaf95c5c35acdf69facc40af14bc0632a16b7d162
size 54093198

View File

@ -1,2 +0,0 @@
41725f3b40ca44954bb9de6e7dcbfd2c tensorrt_llm_batch_manager_static.lib
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d333952a574e9056a0f01f788f9c053d92a7a9bc988a335df663697405f5659
size 3102572
oid sha256:d6ef115e34695dd0bec9df6069dd2e95615f401546ce275b133145fdb7568c6c
size 3102764

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa6dcdbe9501429192102c1094be664d0ab07199acc4882ab061eb48b699d83c
size 3145248
oid sha256:3bc68d4aec21a361954dd144084edb050e19390b87d6668f88b9e7f110f717a0
size 3145744

View File

@ -1,3 +1,3 @@
15c05b1921f3f8cbb7bc1e53f189c661 libtensorrt_llm_executor_static.a
b586d90eac1293ea656ff2db8a35cd92 libtensorrt_llm_executor_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
288c6aa23b993d60d758107188c37d78 libtensorrt_llm_executor_static.a
20e46fb4b5b23a0f27eb3f8dd7d4d3bf libtensorrt_llm_executor_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4b144867eb79d269db173338ccae8dc6c68a8374a49fe17a555034ac433f46f
size 3457528
oid sha256:5b819d5cf3f7d9bd1ee69427db4b7ce3eb9c17f9e2cfa319540dad51ed6648e7
size 3457520

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a7b5f70dda7b8042d09098859a4bd1851159d3c750da97f3bfd55bf713c7a1cc
size 3447846
oid sha256:9f123e25ff2e046afff092b082a60b461f3f13853630857bd166e5e8a084e1ee
size 3448406

View File

@ -1,3 +1,3 @@
d573456348a96fd7a97aa832f1113986 libtensorrt_llm_executor_static.a
b3ba7776e3b5bb6e750e8412fc3b5c60 libtensorrt_llm_executor_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
eebaf66c6ac42645a9bf12a0b013ac4b libtensorrt_llm_executor_static.a
1fc4cc62abfb31ad51f6ae3620641c04 libtensorrt_llm_executor_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3722133c17bdad38c3bbd3a5caa4eafbe17805d3a7fa784f10c313902020a13b
size 26350954

View File

@ -1,2 +0,0 @@
5beaadd32fc3dd25770746016b293229 tensorrt_llm_executor_static.lib
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41cb6788cd975f0f2ef1de0bdff9d3178280a3c2683abb11929e0d5ccecc9d76
size 126752312
oid sha256:f82db62eaeeb8a02d44b4cad97ae050cc447eea8a3e48b03f56f6040d1aaccc8
size 126824176

View File

@ -1,2 +1,2 @@
200a2d19469277b9906a00f7da83fd04 libtensorrt_llm_nvrtc_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
f3143205203b038b9dca6dd32cf02f59 libtensorrt_llm_nvrtc_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:738668482149971eaa849b863360e21fe66781c5eeaadab8263c83a3b67637dc
size 133824576
oid sha256:bdb100ae1f96025c5dd7e977cbae05005c2b2b3f36b902edc3a4f24d85ec3731
size 133867944

View File

@ -1,2 +1,2 @@
20761e50ba44b91a7a2d1f8d1c5c780b libtensorrt_llm_nvrtc_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
770ca93818f3f04837a67353e3f71fbc libtensorrt_llm_nvrtc_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5926bdebb3d02686d81b3e29edbdf3fb89e44f1518ae187a66284175fb980613
size 1230336

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97eb854ba561c25d46c3e9a074dc5a8ba873923fd28d649c869d9db450e26a8a
size 3488

View File

@ -1,3 +0,0 @@
3082017cee538017c343567d938bb106 tensorrt_llm_nvrtc_wrapper.lib
b9b4bf6a2d38abae1d3e038ad73b3890 tensorrt_llm_nvrtc_wrapper.dll
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c9e20ddfa9d8b200a595a3a59afd53356afe4ad1b030f0b2cf446cac7271c58
size 53382370
oid sha256:cb21d66bb8b8eec2f6e11696b2b9b4b629b92ab299fec6702f2102277bb453bf
size 53355202

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b615014ee1f3e9ef8fd498a72ef8f699b92727177416c05d08bbf1c95d3ff52e
size 53479636
oid sha256:7be13e39772baa2ade76d60407fa0d56ecca58c39b24e020f1f0b58c0eede5f0
size 53469348

View File

@ -1,3 +1,3 @@
32e01331abfcacf5b70854104ca4bf20 libtensorrt_llm_internal_cutlass_kernels_static.a
662d22cefd410c1851ac701e6e3bcbbf libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
6bf0ba4e9b8b1152a21316243d30bec6 libtensorrt_llm_internal_cutlass_kernels_static.a
96f8a359c84a78ba415f4d98ef1c4e1d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:baf28ca8666062b417d251b608f31227cfac15676aa0bbbaacb1263befc9d408
size 68138662
oid sha256:d5cb27ba31185f16333b697bf9d913015ee85508e96aea2248162f3ff9a618b9
size 68126454

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05358d9334e8976d463a40b4c6b0b5d780561556cb89194a8f243b0b69f59f33
size 68308080
oid sha256:c22c8b6856111183fc44fb11c8843ea8506f7297a97fee1e9a98414f9526118a
size 68295728

View File

@ -1,3 +1,3 @@
d8af682c4274543b06992255e727f52e libtensorrt_llm_internal_cutlass_kernels_static.a
c797baf2a0a7538eb8f75e0f898ae208 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
64df74edb7e87b69478e4f9a2c0b3bb7 libtensorrt_llm_internal_cutlass_kernels_static.a
68a43f555a5b930950a436ebb54a1267 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3856b1a5ae0f8f8e489f79d6396f2cc2e823536728b042082892502bcd33d76
size 246691546

View File

@ -1,2 +0,0 @@
ab51496e515622f560f4b989ed1d7e63 tensorrt_llm_internal_cutlass_kernels_static.lib
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -213,8 +213,9 @@ __global__ void batchApplyPenalty(T const* const* inputLogits, T* outputLogits,
if (hasMinLength)
{
__syncthreads();
// Min length
if ((threadIdx.x == 0) && (currentStep - inputLen < minLength))
// If current generation length is too short, make sure EOS doesn't have high probability.
// This check is not needed when endId is already -1 as generation won't stop on EOS anyway.
if ((threadIdx.x == 0) && (currentStep - inputLen < minLength) && endIds[batchSlot] > -1)
{
outLogitsPtr[endIds[batchSlot]] = MASK_VAL;
}
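The new `endIds[batchSlot] > -1` guard skips EOS masking when no end token is configured, since -1 means generation does not stop on EOS and using it as an index would be invalid. A host-side sketch of the same rule, with hypothetical names (`logits`, `applyMinLengthPenalty`) standing in for the kernel's buffers:
```cpp
#include <limits>
#include <vector>

// Hypothetical host-side equivalent of the kernel's min-length handling.
void applyMinLengthPenalty(
    std::vector<float>& logits, int currentStep, int inputLen, int minLength, int endId)
{
    constexpr float MASK_VAL = -std::numeric_limits<float>::infinity();
    // Mask EOS only while the generated length is below minLength,
    // and only if an end token is actually configured (endId == -1 disables it).
    if (currentStep - inputLen < minLength && endId > -1)
    {
        logits[endId] = MASK_VAL;
    }
}
```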

View File

@ -79,7 +79,8 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr
if (tid < k && endIds != nullptr) // if returnAllSelectedToken, endIds would not be an input
{
auto const index = tmpTopKBufIndex + tid;
if (blockLane == 0 && tid == 0)
// endId=-1 means generation doesn't stop upon encountering a certain token.
if (blockLane == 0 && tid == 0 && endIds[batchSlot] > -1)
{
auto const endId = endIds[batchSlot];
topKTmpIdBuf[index] = tmpLogBufIndex + endId;

View File

@ -252,29 +252,49 @@ __device__ __forceinline__ void MULTIMEM_ST2(ValType& val, PtrType ptr)
asm volatile("multimem.st.global.v2.f32 [%0], {%1,%2};" ::"l"(ptr), "r"(val.x), "r"(val.y) : "memory");
}
template <typename DType, typename ValType, typename PtrType>
template <typename DType, bool const DISABLE_FP32_ACC, typename ValType, typename PtrType>
__device__ __forceinline__ void MULTIMEM_LD(ValType& val, PtrType ptr)
{
if constexpr (std::is_same_v<DType, half>)
{
asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
if (!DISABLE_FP32_ACC)
{
asm("multimem.ld_reduce.global.add.v4.f16x2.acc::f32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
else
{
asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
}
#ifdef ENABLE_BF16
if constexpr (std::is_same_v<DType, __nv_bfloat16>)
{
asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
if (!DISABLE_FP32_ACC)
{
asm("multimem.ld_reduce.global.add.v4.bf16x2.acc::f32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
else
{
asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
}
#endif
}
// All MC kernels here
template <typename DType, int RANKS>
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
void** commbuff, int const handleridx, float4* mc_ptr)
@ -310,7 +330,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
uint4 val[UNROLL_MC];
#pragma unroll
for (int i = 0; i < UNROLL_MC; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
#pragma unroll
for (int i = 0; i < UNROLL_MC; i++)
MULTIMEM_ST(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
@ -318,7 +338,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
for (int line = end_aligned; line < end_elem; line += loop_step0)
{
uint4 val;
MULTIMEM_LD<DType>(val, mc_ptr + (lineoffset + line));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val, mc_ptr + (lineoffset + line));
MULTIMEM_ST(val, mc_ptr + (lineoffset + line));
}
__syncthreads();
@ -336,7 +356,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
} // fp16 inplace reduce kernel (Hopper) MC
#else
template <typename DType, int RANKS>
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
void** commbuff, int const handleridx, float4* mc_ptr)
@ -382,8 +402,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg3), reinterpret_cast<void*>(&arg4), reinterpret_cast<void*>(&arg5), \
reinterpret_cast<void*>(&arg6), reinterpret_cast<void*>(&arg7), reinterpret_cast<void*>(&arg8), \
reinterpret_cast<void*>(&arg9), reinterpret_cast<void*>(&arg10)}; \
TLLM_CUDA_CHECK( \
cudaLaunchKernelExC(&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
}
struct LaunchConfig
@ -529,7 +549,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4_mc(PackedVec<Type>& vec, float SFScaleV
#endif
}
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4(int const op, int const flagoffset, int const firstrank,
int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff,
@ -574,7 +594,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -643,7 +663,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
#endif
}
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
@ -687,7 +707,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -744,7 +764,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
#if __CUDA_ARCH__ >= 900
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -786,7 +806,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -848,7 +868,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
*reduceidptr = reduce_id;
} // quant kernel fp16->fp8 twoshot
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -889,7 +909,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -997,7 +1017,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
} // residual allgather kernel
#else
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -1018,7 +1038,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
asm volatile("brkpt;\n");
}
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -1059,8 +1079,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg12), reinterpret_cast<void*>(&arg13), reinterpret_cast<void*>(&arg14), \
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
}
#define callranksMC_RMSNORM_QUANT_ONESHOT(x) \
@ -1091,8 +1111,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg12), reinterpret_cast<void*>(&arg13), reinterpret_cast<void*>(&arg14), \
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot<DType, x, DISABLE_FP32_ACC>), \
kernelArgs)); \
}
#define callranksMC_RMSNORM_QUANT_FP4(x) \
@ -1127,8 +1148,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20), \
reinterpret_cast<void*>(&arg21), reinterpret_cast<void*>(&arg22), reinterpret_cast<void*>(&arg23)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
}
#define callranksMC_RMSNORM_QUANT_FP4_ONESHOT(x) \
@ -1163,8 +1184,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20), \
reinterpret_cast<void*>(&arg21), reinterpret_cast<void*>(&arg22)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot<DType, x, DISABLE_FP32_ACC>), \
kernelArgs)); \
}
#define callranksMC_RES_AG(x) \
if (nlines == x) \
@ -1189,7 +1211,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_res_allgather<DType, x>), kernelArgs)); \
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size_t const offset, size_t const elements,
int const blocksize, communicator* comm, cudaStream_t stream, int op)
{
@ -1226,7 +1248,7 @@ int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size
return sms;
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
void allreduce_nonsharp_inplace(
int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream, int op)
{
@ -1234,14 +1256,25 @@ void allreduce_nonsharp_inplace(
return;
int blocksize = elements * 2;
int maxcredit = 0;
int sms = allreduce2_userbuff_inplace_gpu<DType>(maxcredit, handler, offset, elements, blocksize, comm, stream, op);
int sms;
if (DISABLE_FP32_ACC)
{
sms = allreduce2_userbuff_inplace_gpu<DType, true>(
maxcredit, handler, offset, elements, blocksize, comm, stream, op);
}
else
{
sms = allreduce2_userbuff_inplace_gpu<DType, false>(
maxcredit, handler, offset, elements, blocksize, comm, stream, op);
}
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
void allreduce2_userbuff_inplace(
int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream)
{
allreduce_nonsharp_inplace<DType>(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2);
allreduce_nonsharp_inplace<DType, DISABLE_FP32_ACC>(
handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2);
}
bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size)
@ -1262,7 +1295,7 @@ bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size)
}
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const offset, int const out_handler,
size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps,
float* scalefactor, void* residual_in, void* residual_out, communicator* comm, cudaStream_t stream)
@ -1315,7 +1348,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const of
return sms;
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
int allreduce2_userbuff_inplace_rmsnorm_quant_fp4(int const handler, size_t const offset, int const out_handler,
size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements,
int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in,
@ -1422,11 +1455,31 @@ void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, si
{
switch (dataType)
{
case nvinfer1::DataType::kHALF: allreduce2_userbuff_inplace<half>(handler, offset, elements, comm, stream); break;
case nvinfer1::DataType::kHALF:
{
if (kDISABLE_FP32_ACCUMULATION)
{
allreduce2_userbuff_inplace<half, true>(handler, offset, elements, comm, stream);
}
else
{
allreduce2_userbuff_inplace<half, false>(handler, offset, elements, comm, stream);
}
break;
}
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
allreduce2_userbuff_inplace<__nv_bfloat16>(handler, offset, elements, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
allreduce2_userbuff_inplace<__nv_bfloat16, true>(handler, offset, elements, comm, stream);
}
else
{
allreduce2_userbuff_inplace<__nv_bfloat16, false>(handler, offset, elements, comm, stream);
}
break;
}
#endif
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_impl");
}
@ -1458,14 +1511,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_impl(int const handler, size_t con
switch (dataType)
{
case nvinfer1::DataType::kHALF:
return allreduce2_userbuff_inplace_rmsnorm_quant<half>(handler, offset, out_handler, out_offset, elements,
hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant<half, true>(handler, offset, out_handler, out_offset,
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant<half, false>(handler, offset, out_handler, out_offset,
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
}
break;
}
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16>(handler, offset, out_handler, out_offset,
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, true>(handler, offset, out_handler,
out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm,
stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, false>(handler, offset, out_handler,
out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm,
stream);
}
break;
}
#endif
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl");
}
@ -1479,16 +1554,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t
switch (dataType)
{
case nvinfer1::DataType::kHALF:
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half, true>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half, false>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
}
break;
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, true>(handler, offset, out_handler,
out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor,
residual_in, residual_out, comm, stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, false>(handler, offset, out_handler,
out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor,
residual_in, residual_out, comm, stream);
}
break;
}
#endif
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl");
}

View File

@ -656,4 +656,7 @@ __inline__ __device__ T blockReduceSumV2(T* val)
warpReduceSumV2<T, NUM>(val);
return (T) 0.0f;
}
static bool const kDISABLE_FP32_ACCUMULATION = getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr;
} // namespace tensorrt_llm::runtime::ub
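`kDISABLE_FP32_ACCUMULATION` is read once from the environment and then branched on to choose between the `<DType, true>` and `<DType, false>` instantiations, as the userbuffers changes above show. A minimal sketch of that runtime-flag-to-template-parameter dispatch, with a hypothetical `reduceInplace` entry point:
```cpp
#include <cstdlib>

// Hypothetical templated entry point; the bool selects the kernel variant at compile time.
template <typename DType, bool DISABLE_FP32_ACC>
void reduceInplace(DType* data, int n)
{
    // Placeholder body; the real kernels pick the .acc::f32 or plain multimem PTX here.
    (void) data;
    (void) n;
}

// Read the switch once at start-up, mirroring TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION.
static bool const kDisableFp32Acc = std::getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr;

template <typename DType>
void reduceInplaceDispatch(DType* data, int n)
{
    // A runtime flag cannot be a template argument directly, so branch once and
    // instantiate both variants; each body is still selected at compile time.
    if (kDisableFp32Acc)
    {
        reduceInplace<DType, true>(data, n);
    }
    else
    {
        reduceInplace<DType, false>(data, n);
    }
}
```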

View File

@ -96,18 +96,7 @@ inline bool is_supported(int arch, KernelType kernel_type)
SUPPORT(KernelType::FP16Int8PerChannel);
SUPPORT(KernelType::FP16Int4PerChannel);
}
else if (arch >= 80 && arch < 90)
{
SUPPORT(KernelType::FP16Int8Groupwise);
SUPPORT(KernelType::BF16Int8Groupwise);
SUPPORT(KernelType::FP16Int4Groupwise);
SUPPORT(KernelType::BF16Int4Groupwise);
SUPPORT(KernelType::FP16Int8PerChannel);
SUPPORT(KernelType::BF16Int8PerChannel);
SUPPORT(KernelType::FP16Int4PerChannel);
SUPPORT(KernelType::BF16Int4PerChannel);
}
else if (arch >= 90 && arch != 120)
else if (arch >= 80)
{
SUPPORT(KernelType::FP16Int8Groupwise);
SUPPORT(KernelType::BF16Int8Groupwise);
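This change collapses the separate `80 <= arch < 90` and `arch >= 90 && arch != 120` branches into a single `arch >= 80` branch, so newer architectures (including SM120) share the same support list. A hedged sketch of the consolidated gate, with a hypothetical two-entry kernel-type list:
```cpp
// Hypothetical reduced version of the consolidated architecture gate.
enum class KernelType
{
    FP16Int8Groupwise,
    BF16Int8Groupwise
};

bool isSupported(int arch, KernelType kernelType)
{
    // SM80 and newer now share a single support list instead of separate
    // 80-90 and 90+ branches with an SM120 carve-out.
    if (arch >= 80)
    {
        switch (kernelType)
        {
        case KernelType::FP16Int8Groupwise:
        case KernelType::BF16Int8Groupwise: return true;
        }
    }
    return false;
}
```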

View File

@ -64,7 +64,7 @@ void initBindings(pybind11::module_& m)
py::classh<GenLlmReq>(m, "GenericLlmRequest")
.def("validate", &GenLlmReq::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
py::arg("max_draft_len"), py::arg("max_endocer_input_len") = std::nullopt,
py::arg("max_draft_len"), py::arg("vocab_size_padded"), py::arg("max_endocer_input_len") = std::nullopt,
py::arg("enable_kv_cache_reuse") = false)
.def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, py::arg("exclude"))
.def("get_num_tokens", &GenLlmReq::getNumTokens, py::arg("beam"))

View File

@ -96,7 +96,10 @@ def build_engines(model_cache: typing.Optional[str] = None,
world_size: int = 1,
clean: Optional[bool] = False):
for model_name in ["chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b"]:
for model_name in [
"chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b", "glm-4-9b",
"chatglm3-6b-32k"
]:
is_chatglm_6b_or_glm_10b = model_name in ["chatglm-6b", "glm-10b"]
if model_cache and (Path(model_cache) / model_name).is_dir():
model_cache_dir = Path(model_cache) / model_name
@ -136,20 +139,12 @@ def build_engines(model_cache: typing.Optional[str] = None,
if ckpt_dir.is_dir():
shutil.rmtree(ckpt_dir, ignore_errors=True)
# Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B, hope to remove this in the future
if model_name == "chatglm-6b":
# Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B / ChatGLM3-6B-32K, hope to remove this in the future
if model_name in [
"chatglm-6b", "glm-4-9b", "chatglm2-6b", "chatglm3-6b-32k"
]:
shutil.copy(
chatglm_example_dir / "chatglm-6b/tokenization_chatglm.py",
hf_dir,
)
if model_name == "glm-4-9b":
shutil.copy(
chatglm_example_dir / "glm-4-9b/tokenization_chatglm.py",
hf_dir,
)
if model_name == "chatglm2-6b":
shutil.copy(
chatglm_example_dir / "chatglm2-6b/tokenization_chatglm.py",
chatglm_example_dir / f"{model_name}/tokenization_chatglm.py",
hf_dir,
)

View File

@ -530,12 +530,6 @@ def prepare_multi_gpu_model_tests(python_exe: str,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)
prepare_model_tests(model_name="chatglm",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)
def prepare_model_tests(model_name: str,
python_exe: str,

View File

@ -1,9 +1,9 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=25.01-py3
ARG BASE_TAG=25.03-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} as base
FROM ${BASE_IMAGE}:${BASE_TAG} AS base
# https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html
# The default values come from `nvcr.io/nvidia/pytorch`
@ -14,7 +14,10 @@ ENV GITHUB_MIRROR=$GITHUB_MIRROR
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]
FROM base as devel
# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true
FROM base AS devel
ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"
@ -59,7 +62,7 @@ ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
FROM ${DEVEL_IMAGE} as wheel
FROM ${DEVEL_IMAGE} AS wheel
WORKDIR /src/tensorrt_llm
COPY benchmarks benchmarks
COPY cpp cpp
@ -77,7 +80,7 @@ ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings -
RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \
python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}
FROM ${DEVEL_IMAGE} as release
FROM ${DEVEL_IMAGE} AS release
# Create a cache directory for pip
RUN mkdir -p /root/.cache/pip

View File

@ -152,16 +152,16 @@ jenkins-aarch64_%: STAGE = devel
jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
jenkins-rockylinux8_%: STAGE = devel
jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
jenkins-rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8
jenkins-rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8
rockylinux8_%: STAGE = devel
rockylinux8_%: BASE_IMAGE = nvidia/cuda
rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8
rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8
# For x86_64 and aarch64
ubuntu22_%: STAGE = devel
ubuntu22_%: BASE_IMAGE = nvidia/cuda
ubuntu22_%: BASE_TAG = 12.6.3-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 12.8.1-devel-ubuntu22.04
trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0

View File

@ -5,7 +5,7 @@ set -ex
# This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
# Only use when public CUDA image is not ready.
CUDA_VER="12.8.0_570.86.10"
CUDA_VER="12.8.1_570.124.06"
CUDA_VER_SHORT="${CUDA_VER%_*}"
NVCC_VERSION_OUTPUT=$(nvcc --version)

View File

@ -4,9 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01
# PyTorch v2.5.1 has a fix for https://github.com/pytorch/pytorch/issues/138324.
TORCH_VERSION="2.5.1"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03
TORCH_VERSION="2.6.0"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {

View File

@ -2,20 +2,20 @@
set -ex
TRT_VER="10.8.0.43"
TRT_VER="10.9.0.34"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01
CUDA_VER="12.8" # 12.8.0
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03
CUDA_VER="12.8" # 12.8.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.7.0.66-1"
CUDNN_VER="9.8.0.87-1"
NCCL_VER="2.25.1-1+cuda12.8"
CUBLAS_VER="12.8.3.14-1"
CUBLAS_VER="12.8.4.1-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.8.61-1"
CUDA_RUNTIME="12.8.57-1"
CUDA_DRIVER_VERSION="570.86.10-1.el8"
NVRTC_VER="12.8.93-1"
CUDA_RUNTIME="12.8.90-1"
CUDA_DRIVER_VERSION="570.124.06-1.el8"
for i in "$@"; do
case $i in
@ -116,7 +116,7 @@ install_tensorrt() {
if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

View File

@ -1,7 +1,7 @@
sphinx>=7.0
sphinx-argparse
sphinx-click
sphinx-rtd-theme
nvidia-sphinx-theme
myst_parser
breathe
pygit2

View File

@ -67,7 +67,7 @@ source_suffix = {
'.md': 'markdown',
}
html_theme = 'sphinx_rtd_theme'
html_theme = 'nvidia_sphinx_theme'
html_static_path = ['_static']
# ------------------------ C++ Doc related --------------------------

View File

@ -27,8 +27,6 @@ Welcome to TensorRT-LLM's Documentation!
installation/linux.md
installation/build-from-source-linux.md
installation/windows.md
installation/build-from-source-windows.md
installation/grace-hopper.md

View File

@ -1,199 +0,0 @@
(build-from-source-windows)=
# Building from Source Code on Windows
```{note}
This section is for advanced users. Skip this section if you plan to use the pre-built TensorRT-LLM release wheel.
```
## Prerequisites
1. Install prerequisites listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document.
2. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path.
3. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/).
4. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip).
## Building a TensorRT-LLM Docker Image
### Docker Desktop
1. Install [Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/).
2. Set the following configurations:
1. Right-click the Docker icon in the Windows system tray (bottom right of your taskbar) and select **Switch to Windows containers...**.
2. In the Docker Desktop settings on the **General** tab, uncheck **Use the WSL 2 based image**.
3. On the **Docker Engine** tab, set your configuration file to:
```
{
"experimental": true
}
```
```{note}
After building, copy the files out of your container. `docker cp` is not supported on Windows for Hyper-V based images. Unless you are using WSL 2 based images, mount a folder, for example, `trt-llm-build`, to your container when you run it for moving files between the container and host system.
```
### Acquire an Image
The Docker container will be hosted for public download in a future release. At this time, it must be built manually. From the `TensorRT-LLM\windows\` folder, run the build command:
```bash
docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest .
```
And your image is now ready for use.
### Run the Container
Run the container in interactive mode with your build folder mounted. Specify a memory limit with the `-m` flag. By default, the limit is 2 GB, which is not sufficient to build TensorRT-LLM.
```bash
docker run -it -m 12g -v .\trt-llm-build:C:\workspace\trt-llm-build tensorrt-llm-windows-build:latest
```
### Build and Extract Files
1. Clone and setup the TensorRT-LLM repository within the container.
```bash
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git submodule update --init --recursive
```
2. Build TensorRT-LLM. This command generates `build\tensorrt_llm-*.whl`.
```bash
python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-10.8.0.43\
```
3. Copy or move `build\tensorrt_llm-*.whl` into your mounted folder so it can be accessed on your host machine. If you intend to use the C++ runtime, you'll also need to gather various DLLs from the build into your mounted folder. For more information, refer to [C++ Runtime Usage](#c-runtime-usage).
## Building TensorRT-LLM on Bare Metal
**Prerequisites**
1. Install all prerequisites (`git`, `python`, `CUDA`) listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document.
2. Install Nsight NVTX. TensorRT-LLM on Windows currently depends on NVTX assets that do not come packaged with the CUDA 12.8.0 installer. To install these assets, download the [CUDA 11.8 Toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows&target_arch=x86_64).
1. During installation, select **Advanced installation**.
2. Nsight NVTX is located in the CUDA drop-down.
3. Deselect all packages, and select **Nsight NVTX**.
3. Install the dependencies one of two ways:
1. Run the `setup_build_env.ps1` script, which installs CMake, Microsoft Visual Studio Build Tools, and TensorRT automatically with default settings.
1. Run PowerShell as Administrator to use the script.
```bash
./setup_build_env.ps1 -TRTPath <TRT-containing-folder> [-skipCMake] [-skipVSBuildTools] [-skipTRT]
```
2. Close and reopen PowerShell after running the script so that `Path` changes take effect.
3. Supply a directory that already exists to contain TensorRT to `-TRTPath`, for example, `-TRTPath ~/inference` may be valid, but `-TRTPath ~/inference/TensorRT` will not be valid if `TensorRT` does not exist. `-TRTPath` isn't required if `-skipTRT` is supplied.
2. Install the dependencies one at a time.
1. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path.
2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). When prompted to select more Workloads, check **Desktop development with C++**.
3. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`.
1. Add the libraries for TensorRT to your system's `Path` environment variable. Your `Path` should include a line like this:
```bash
%USERPROFILE%\inference\TensorRT\lib
```
2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.
3. Remove existing `tensorrt` wheels first by executing
```bash
pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings
pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12
```
4. Install the TensorRT core libraries, run PowerShell, and use `pip` to install the Python wheel.
```bash
pip install %USERPROFILE%\inference\TensorRT\python\tensorrt-*.whl
```
5. Verify that your TensorRT installation is working properly.
```bash
python -c "import tensorrt as trt; print(trt.__version__)"
```
**Steps**
1. Launch a 64-bit Developer PowerShell. From your usual PowerShell terminal, run one of the following two commands.
1. If you installed Visual Studio Build Tools (that is, used the `setup_build_env.ps1` script):
```bash
& 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64
```
2. If you installed Visual Studio Community (e.g. via manual GUI setup):
```bash
& 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64
```
2. In PowerShell, from the `TensorRT-LLM` root folder, run:
```bash
python .\scripts\build_wheel.py -a "89-real" --trt_root <path_to_trt_root>
```
The `-a` flag specifies the device architecture. `"89-real"` supports GeForce 40-series cards.
The flag `-D "ENABLE_MULTI_DEVICE=0"`, while not specified here, is implied on Windows. Multi-device inference is supported on Linux, but not on Windows.
This command generates `build\tensorrt_llm-*.whl`.
(c-runtime-usage)=
## Linking with the TensorRT-LLM C++ Runtime
```{note}
This section is for advanced users. Skip this section if you do not intend to use the TensorRT-LLM C++ runtime directly. You must build from source to use the C++ runtime.
```
Building from source creates libraries that can be used if you wish to directly link against the C++ runtime for TensorRT-LLM. These libraries are also required if you wish to run C++ unit tests and some benchmarks.
Building from source produces the following library files.
- `tensorrt_llm` libraries located in `cpp\build\tensorrt_llm`
- `tensorrt_llm.dll` - Shared library
- `tensorrt_llm.exp` - Export file
- `tensorrt_llm.lib` - Stub for linking to `tensorrt_llm.dll`
- Dependency libraries (these get copied to `tensorrt_llm\libs\`)
- `nvinfer_plugin_tensorrt_llm` libraries located in `cpp\build\tensorrt_llm\plugins\`
- `nvinfer_plugin_tensorrt_llm.dll`
- `nvinfer_plugin_tensorrt_llm.exp`
- `nvinfer_plugin_tensorrt_llm.lib`
- `th_common` libraries located in `cpp\build\tensorrt_llm\thop\`
- `th_common.dll`
- `th_common.exp`
- `th_common.lib`
The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
```bash
%USERPROFILE%\inference\TensorRT\lib
%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib
```
Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder.
Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.

View File

@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
```bash
pip3 install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm --extra-index-url https://pypi.nvidia.com
```

View File

@ -1,81 +0,0 @@
(windows)=
# Installing on Windows
```{note}
The Windows release of TensorRT-LLM is currently in beta.
We recommend checking out the [v0.17.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.17.0) for the most stable experience.
```
```{note}
TensorRT-LLM on Windows only supports single-GPU execution.
```
**Prerequisites**
1. Clone this repository using [Git for Windows](https://git-scm.com/download/win).
2. Install the dependencies one of two ways:
1. Install all dependencies together.
1. Run the provided PowerShell script `setup_env.ps1` located under the `/windows/` folder which installs Python and CUDA 12.8.0 automatically with default settings. Run PowerShell as Administrator to use the script.
```bash
./setup_env.ps1 [-skipCUDA] [-skipPython]
```
2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path` modified by the `setup_env.ps1` script above.
2. Install the dependencies one at a time.
1. Install [Python 3.10](https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe).
1. Select **Add python.exe to PATH** at the start of the installation. The installation may only add the `python` command, but not the `python3` command.
2. Navigate to the installation path `%USERPROFILE%\AppData\Local\Programs\Python\Python310` (`AppData` is a hidden folder) and copy `python.exe` to `python3.exe`.
2. Install [CUDA 12.8.0 Toolkit](https://developer.nvidia.com/cuda-12-8-0-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart.
3. If using conda environment, run the following command before installing TensorRT-LLM.
```bash
conda install -c conda-forge pyarrow
```
**Steps**
1. Install TensorRT-LLM.
If you have an existing TensorRT installation (from older versions of `tensorrt_llm`), please execute
```bash
pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings
pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12
```
before installing TensorRT-LLM with the following command.
```bash
pip install tensorrt_llm==0.17.0.post1 --extra-index-url https://download.pytorch.org/whl/ --extra-index-url https://pypi.nvidia.com
```
Run the following command to verify that your TensorRT-LLM installation is working properly.
```bash
python -c "import tensorrt_llm; print(tensorrt_llm._utils.trt_version())"
```
2. Build the model.
3. Deploy the model.
**Known Issue**
1. `OSError: exception: access violation reading 0x0000000000000000` during `import tensorrt_llm` or `trtllm-build`.
This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install
[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version)
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL).
2. `OSError: [WinError 126] The specified module could not be found. Error loading "...\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.`
Installing the latest [Build Tools for Visual Studio 2022](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue.

View File

@ -35,7 +35,7 @@ Certain limitations might apply. Refer to the {ref}`support-matrix` for more inf
### Native Windows Support
Application developers and AI enthusiasts can now benefit from accelerated LLMs running locally on PCs and Workstations powered by NVIDIA RTX and NVIDIA GeForce RTX GPUs. Refer to the {ref}`windows` section for more information.
Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases.
## What Can You Do With TensorRT-LLM?

View File

@ -90,7 +90,7 @@ In addition, older architectures can have limitations for newer software release
* -
- Hardware Compatibility
* - Operating System
- TensorRT-LLM requires Linux x86_64, Linux aarch64 or Windows.
- TensorRT-LLM requires Linux x86_64 or Linux aarch64.
* - GPU Model Architectures
-
- [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)
@ -112,9 +112,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [25.01](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [25.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.8](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.9](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

View File

@ -5,6 +5,23 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
## TensorRT-LLM Release 0.18.0
### Key Features and Enhancements
- **Features that were previously available in the 0.18.0.dev pre-releases are not included in this release**.
- [BREAKING CHANGE] Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases.
### Known Issues
- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.
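For illustration, a minimal sketch of launching that container (assuming Docker with the NVIDIA Container Toolkit is installed; adjust the tag to the container release you need):
```bash
docker run --gpus all --ipc=host -it --rm nvcr.io/nvidia/pytorch:25.03-py3
```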
### Infrastructure Changes
- The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.03-py3`.
- The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.03-py3`.
- The dependent TensorRT version is updated to 10.9.
- The dependent CUDA version is updated to 12.8.1.
- The dependent NVIDIA ModelOpt version is updated to 0.25 for Linux platform.
## TensorRT-LLM Release 0.17.0
### Key Features and Enhancements

View File

@ -33,7 +33,7 @@ Or you can try the following commands to get a quantized model by yourself:
```bash
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
cd TensorRT-Model-Optimizer/llm_ptq
cd TensorRT-Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
```
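As a hedged sketch only: one way to consume the exported Hugging Face-format checkpoint is through the `tensorrt_llm.LLM` API. The `<quantized_checkpoint_dir>` path is a hypothetical placeholder for the directory produced by the script above, and whether it loads end-to-end depends on the model and the backend in use:
```python
from tensorrt_llm import LLM, SamplingParams

# Hypothetical path: wherever huggingface_example.sh wrote the quantized checkpoint.
llm = LLM(model="<quantized_checkpoint_dir>")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```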

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -0,0 +1,313 @@
import json
import os
from typing import Dict, List, Optional, Union

from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy


class SPTokenizer:

    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = [
            "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>",
            "<|user|>", "<|assistant|>", "<|observation|>"
        ]
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        text, buffer = "", []
        for token in t:
            if token in self.index_special_tokens:
                if buffer:
                    text += self.sp_model.decode(buffer)
                    buffer = []
                text += self.index_special_tokens[token]
            else:
                buffer.append(token)
        if buffer:
            text += self.sp_model.decode(buffer)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
            return ""
        return self.sp_model.IdToPiece(index)


class ChatGLMTokenizer(PreTrainedTokenizer):

    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self,
                 vocab_file,
                 padding_side="left",
                 clean_up_tokenization_spaces=False,
                 **kwargs):
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<pad>": self.tokenizer.pad_id
        }
        super().__init__(
            padding_side=padding_side,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs)

    def get_command(self, token):
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return "<unk>"

    @property
    def pad_token(self) -> str:
        return "<unk>"

    @property
    def pad_token_id(self):
        return self.get_command("<pad>")

    @property
    def eos_token(self) -> str:
        return "</s>"

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {
            self._convert_id_to_token(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory,
                                      self.vocab_files_names["vocab_file"])
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_bytes = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_bytes)

        return (vocab_file, )

    def get_prefix_tokens(self):
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message):
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")
                       ] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(
                    item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(
                self.build_single_message(item["role"],
                                          item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids],
                                      return_tensors="pt",
                                      is_split_into_words=True)

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [
                self.get_command("<eos>")
            ]
        return token_ids_0

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        padding_side: str = "left",  # wili, fix for new transformers
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.
                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:
                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (
                max_length % pad_to_multiple_of != 0):
            max_length = (
                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
            required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [
                    0
                ] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [
                    0
                ] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[
                0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
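For context, a minimal usage sketch of the tokenizer defined above (the module name and the local `tokenizer.model` path are hypothetical placeholders, assuming the file is saved as `tokenization_chatglm.py` next to a ChatGLM SentencePiece model file):
```python
# Hypothetical module name for the file shown above.
from tokenization_chatglm import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer(vocab_file="tokenizer.model")  # hypothetical local path
# Build chat-formatted input ids for a single user turn; returns a BatchEncoding
# with "input_ids", "attention_mask" and "position_ids" tensors.
encoded = tokenizer.build_chat_input("What is TensorRT-LLM?", role="user")
print(encoded["input_ids"].shape)
```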

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -2,7 +2,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,5 +1,5 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.43.0
datasets==2.14.6
evaluate~=0.4.1

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,3 +1,3 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
nemo-toolkit[all]==2.0.0rc1
megatron-core==0.8.0
datasets~=2.14.5

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets>=2.14.4
nemo-toolkit[all]==2.0.0rc1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
tensorrt_llm==0.17.0.post1
git+https://github.com/google-deepmind/recurrentgemma.git
tensorrt_llm==0.18.0
git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
flax>=0.8.2
jax~=0.4.23
orbax-checkpoint==0.5.7

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
tiktoken
datasets
kaldialign

Some files were not shown because too many files have changed in this diff.