TensorRT-LLM v0.18 release (#3231)

parent 258c7540c0
commit 3c0462002c
@@ -4,7 +4,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
rev: v1.1.13
rev: v1.5.5
hooks:
- id: remove-crlf
- repo: https://github.com/google/yapf
@@ -24,7 +24,7 @@ repos:
- id: check-yaml
- id: trailing-whitespace
- repo: https://github.com/PyCQA/autoflake
rev: v1.6.1
rev: v2.3.1
hooks:
- id: autoflake
args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables']

@@ -7,9 +7,9 @@ TensorRT-LLM
[](https://nvidia.github.io/TensorRT-LLM/)
[](https://www.python.org/downloads/release/python-3123/)
[](https://www.python.org/downloads/release/python-31012/)
[](https://developer.nvidia.com/cuda-downloads)
[](https://developer.nvidia.com/tensorrt)
[](./tensorrt_llm/version.py)
[](https://developer.nvidia.com/cuda-downloads)
[](https://developer.nvidia.com/tensorrt)
[](./tensorrt_llm/version.py)
[](./LICENSE)

[Architecture](./docs/source/architecture/overview.md) | [Performance](./docs/source/performance/perf-overview.md) | [Examples](./examples/) | [Documentation](./docs/source/) | [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

@@ -94,6 +94,7 @@ private:
[[nodiscard]] SizeType32 getMaxSequenceLen() const;
[[nodiscard]] SizeType32 getMaxNumSequences() const;
[[nodiscard]] SizeType32 getMaxDraftLen() const;
[[nodiscard]] SizeType32 getVocabSizePadded() const;

void validateLlmRequest(
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;

@@ -26,8 +26,6 @@
namespace tensorrt_llm::batch_manager
{

namespace tle = tensorrt_llm::executor;

class AllocateKvCache : Algorithm
{
using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

@@ -56,7 +56,7 @@ public:

struct ExpiringBlockComparator
{
inline bool operator()(BlockPtr const& a, BlockPtr const& b) const
bool operator()(BlockPtr const& a, BlockPtr const& b) const
{
// If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the
// raw pointer values.
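The comparator above orders blocks by expiration time and, as the comment notes, falls back to raw pointer values when two blocks expire in the same millisecond. A self-contained sketch of that ordering, with `Block` and `expirationMs` as illustrative stand-ins for the real KV-cache block type:

```cpp
#include <cstdint>
#include <memory>
#include <set>

// Illustrative block type: only the expiration timestamp matters for ordering.
struct Block
{
    std::int64_t expirationMs; // millisecond-resolution expiration time
};
using BlockPtr = std::shared_ptr<Block>;

struct ExpiringBlockComparatorSketch
{
    bool operator()(BlockPtr const& a, BlockPtr const& b) const
    {
        // Primary key: earlier expiration first.
        if (a->expirationMs != b->expirationMs)
        {
            return a->expirationMs < b->expirationMs;
        }
        // Blocks expiring in the same millisecond compare equal on time, so fall
        // back to the raw pointer values to keep the ordering strict-weak.
        return a.get() < b.get();
    }
};

// Usage: std::set<BlockPtr, ExpiringBlockComparatorSketch> expiringBlocks;
```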
@@ -166,6 +166,8 @@ class KVCacheBlock
public:
using IdType = std::int32_t;

static constexpr IdType kCachedBlocksRootId = -1;

explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);

void startScheduling();

@@ -379,6 +381,16 @@ public:
return mKvCacheRetentionConfig.getDecodeDurationMs();
}

[[nodiscard]] bool getContextRequiresCyclicKvCache() const
{
return mContextRequiresCyclicKvCache;
}

void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache)
{
mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache;
}

private:
// Request id of the sequence
LlmRequest::RequestIdType mRequestId;

@@ -392,6 +404,9 @@ private:
runtime::ITensor::SharedPtr mCacheBlockIndices;
// The retention priority to assign to decode blocks
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;

// A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache.
bool mContextRequiresCyclicKvCache{false};
};

// attach metadata to a pool pointer

@@ -443,7 +458,7 @@ public:
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);

~BlockManager();

@@ -712,6 +727,9 @@ private:
SizeType32 mMissedBlocks;
std::set<KVCacheBlock::IdType> reusedBlockIds;

// Whether or not to maintain a hashmap of blocks.
bool mEnableHashKey;

private:
friend class KVCacheManager;
};

@@ -818,16 +836,18 @@ public:
//! \details These blocks become reusable from next step.
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;

virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
[[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;

virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0;
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
LlmRequest::RequestIdType requestId) const
= 0;

virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
[[nodiscard]] virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
std::vector<LlmRequest::RequestIdType> const& requestIds) const
= 0;

virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;

virtual void refreshBlocks() = 0;
virtual void flushIterationEvents() = 0;

@@ -846,7 +866,7 @@ public:
* 2 * modelConfig.getSizePerHead();
}

[[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
[[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);

@@ -924,7 +944,7 @@ public:
return mBlockManager.getNumFreeBlocks();
}

[[nodiscard]] virtual SizeType32 getNumPools() const override
[[nodiscard]] SizeType32 getNumPools() const override
{
return mBlockManager.getNumPools();
}

@@ -994,8 +1014,6 @@ public:
/// @return The number of blocks
[[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override;

void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens);

/// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
void addToken(LlmRequest::RequestIdType requestId) override;

@@ -91,7 +91,7 @@ private:

runtime::ITensor::SharedPtr mPool;
runtime::ITensor::SharedPtr mCurrent;
const std::vector<SizeType32> mBlockIds;
std::vector<SizeType32> const mBlockIds;
size_t mIdx;
};

@@ -490,9 +490,14 @@ public:
initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
}

void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded,
std::optional<SizeType32> maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false)
{
if (mEndId.has_value())
{
TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
"EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
}
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
maxEncoderInputLen.value());
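The new `vocabSizePadded` parameter lets `validate()` reject end IDs that fall outside the padded vocabulary while still allowing -1 (no end token). A minimal standalone sketch of just that range check; `validateEndId` is an illustrative helper, not part of the TensorRT-LLM API:

```cpp
#include <optional>
#include <stdexcept>
#include <string>

// Standalone version of the end-id range check added above.
void validateEndId(std::optional<int> endId, int vocabSizePadded)
{
    if (endId.has_value() && (*endId < -1 || *endId >= vocabSizePadded))
    {
        // -1 means "never stop on an end token", so it is the only negative value allowed.
        throw std::invalid_argument("EndId " + std::to_string(*endId)
            + " is not within acceptable range [-1, " + std::to_string(vocabSizePadded) + ").");
    }
}
```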
@@ -21,6 +21,7 @@
#include "tensorrt_llm/layers/defaultDecodingParams.h"
#include "tensorrt_llm/runtime/common.h"

#include <algorithm>
#include <functional>
#include <optional>
#include <vector>

@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d
|
||||
size 9125406
|
||||
oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1
|
||||
size 9123884
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4d5c2aac4ca61d80f8a61d06a7e17fcfb0c9428a3dd89ade705f14cf651acd4b
|
||||
size 9169292
|
||||
oid sha256:e7a942b813cd05c5d21c82a7e5b5988227988668bf960b3a954443998b4e2a2b
|
||||
size 9167324
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c067e858d968710bfe4575694ff5cabf1fb9d29924f6bd4cee552c4fd485a8ca
|
||||
size 2026192
|
||||
oid sha256:a11179efe519b2b001d6021c7cbea949c81b3618395fa2ce44da9b09d7d35d14
|
||||
size 2029704
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
ca50ae76421863dfebf6080b7f4f6b29 libtensorrt_llm_ucx_wrapper.so
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
9f9942768fd5b0cf5ed19860ad539dc9 libtensorrt_llm_ucx_wrapper.so
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:092faef60d09d008331b2b09453d89563b5fe6c49a6a195051ad51ca80f79277
|
||||
size 8407972
|
||||
oid sha256:e2ec997b71d8b990eecc0034930b24319916ed3615a618f982e1b780d7643bc6
|
||||
size 8408224
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4d00bce3aa2f555f98cb2213d183a3174bb083cd62179ac65dce24d75bd648eb
|
||||
size 8374854
|
||||
oid sha256:2662dfb4833b41f71f4ef9cfe6a46d6312a04065c2343a899e1476102019a180
|
||||
size 8374456
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ac4ad59907a322e6fbb85b9e88cea587cc4f5e4018017726895f64bd800c8552
|
||||
oid sha256:1a1d23d99e2b91fa11ee3e1fb94179c0731065059eb981da9434a42d98ffa4d8
|
||||
size 15592
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
0ec83a0451530fcf0e3f325cdc185043 libtensorrt_llm_ucx_wrapper.so
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
e383212a40dca932c7b77bf4544dab80 libtensorrt_llm_ucx_wrapper.so
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a64b3c16150b34ad6437862eaf95c5c35acdf69facc40af14bc0632a16b7d162
|
||||
size 54093198
|
||||
@ -1,2 +0,0 @@
|
||||
41725f3b40ca44954bb9de6e7dcbfd2c tensorrt_llm_batch_manager_static.lib
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4d333952a574e9056a0f01f788f9c053d92a7a9bc988a335df663697405f5659
|
||||
size 3102572
|
||||
oid sha256:d6ef115e34695dd0bec9df6069dd2e95615f401546ce275b133145fdb7568c6c
|
||||
size 3102764
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:aa6dcdbe9501429192102c1094be664d0ab07199acc4882ab061eb48b699d83c
|
||||
size 3145248
|
||||
oid sha256:3bc68d4aec21a361954dd144084edb050e19390b87d6668f88b9e7f110f717a0
|
||||
size 3145744
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
15c05b1921f3f8cbb7bc1e53f189c661 libtensorrt_llm_executor_static.a
|
||||
b586d90eac1293ea656ff2db8a35cd92 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
288c6aa23b993d60d758107188c37d78 libtensorrt_llm_executor_static.a
|
||||
20e46fb4b5b23a0f27eb3f8dd7d4d3bf libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a4b144867eb79d269db173338ccae8dc6c68a8374a49fe17a555034ac433f46f
|
||||
size 3457528
|
||||
oid sha256:5b819d5cf3f7d9bd1ee69427db4b7ce3eb9c17f9e2cfa319540dad51ed6648e7
|
||||
size 3457520
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a7b5f70dda7b8042d09098859a4bd1851159d3c750da97f3bfd55bf713c7a1cc
|
||||
size 3447846
|
||||
oid sha256:9f123e25ff2e046afff092b082a60b461f3f13853630857bd166e5e8a084e1ee
|
||||
size 3448406
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
d573456348a96fd7a97aa832f1113986 libtensorrt_llm_executor_static.a
|
||||
b3ba7776e3b5bb6e750e8412fc3b5c60 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
eebaf66c6ac42645a9bf12a0b013ac4b libtensorrt_llm_executor_static.a
|
||||
1fc4cc62abfb31ad51f6ae3620641c04 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3722133c17bdad38c3bbd3a5caa4eafbe17805d3a7fa784f10c313902020a13b
|
||||
size 26350954
|
||||
@ -1,2 +0,0 @@
|
||||
5beaadd32fc3dd25770746016b293229 tensorrt_llm_executor_static.lib
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:41cb6788cd975f0f2ef1de0bdff9d3178280a3c2683abb11929e0d5ccecc9d76
|
||||
size 126752312
|
||||
oid sha256:f82db62eaeeb8a02d44b4cad97ae050cc447eea8a3e48b03f56f6040d1aaccc8
|
||||
size 126824176
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
200a2d19469277b9906a00f7da83fd04 libtensorrt_llm_nvrtc_wrapper.so
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
f3143205203b038b9dca6dd32cf02f59 libtensorrt_llm_nvrtc_wrapper.so
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:738668482149971eaa849b863360e21fe66781c5eeaadab8263c83a3b67637dc
|
||||
size 133824576
|
||||
oid sha256:bdb100ae1f96025c5dd7e977cbae05005c2b2b3f36b902edc3a4f24d85ec3731
|
||||
size 133867944
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
20761e50ba44b91a7a2d1f8d1c5c780b libtensorrt_llm_nvrtc_wrapper.so
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
770ca93818f3f04837a67353e3f71fbc libtensorrt_llm_nvrtc_wrapper.so
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5926bdebb3d02686d81b3e29edbdf3fb89e44f1518ae187a66284175fb980613
|
||||
size 1230336
|
||||
@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:97eb854ba561c25d46c3e9a074dc5a8ba873923fd28d649c869d9db450e26a8a
|
||||
size 3488
|
||||
@ -1,3 +0,0 @@
|
||||
3082017cee538017c343567d938bb106 tensorrt_llm_nvrtc_wrapper.lib
|
||||
b9b4bf6a2d38abae1d3e038ad73b3890 tensorrt_llm_nvrtc_wrapper.dll
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2c9e20ddfa9d8b200a595a3a59afd53356afe4ad1b030f0b2cf446cac7271c58
|
||||
size 53382370
|
||||
oid sha256:cb21d66bb8b8eec2f6e11696b2b9b4b629b92ab299fec6702f2102277bb453bf
|
||||
size 53355202
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b615014ee1f3e9ef8fd498a72ef8f699b92727177416c05d08bbf1c95d3ff52e
|
||||
size 53479636
|
||||
oid sha256:7be13e39772baa2ade76d60407fa0d56ecca58c39b24e020f1f0b58c0eede5f0
|
||||
size 53469348
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
32e01331abfcacf5b70854104ca4bf20 libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
662d22cefd410c1851ac701e6e3bcbbf libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
6bf0ba4e9b8b1152a21316243d30bec6 libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
96f8a359c84a78ba415f4d98ef1c4e1d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:baf28ca8666062b417d251b608f31227cfac15676aa0bbbaacb1263befc9d408
|
||||
size 68138662
|
||||
oid sha256:d5cb27ba31185f16333b697bf9d913015ee85508e96aea2248162f3ff9a618b9
|
||||
size 68126454
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:05358d9334e8976d463a40b4c6b0b5d780561556cb89194a8f243b0b69f59f33
|
||||
size 68308080
|
||||
oid sha256:c22c8b6856111183fc44fb11c8843ea8506f7297a97fee1e9a98414f9526118a
|
||||
size 68295728
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
d8af682c4274543b06992255e727f52e libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
c797baf2a0a7538eb8f75e0f898ae208 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
64df74edb7e87b69478e4f9a2c0b3bb7 libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
68a43f555a5b930950a436ebb54a1267 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit
|
||||
@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b3856b1a5ae0f8f8e489f79d6396f2cc2e823536728b042082892502bcd33d76
|
||||
size 246691546
|
||||
@ -1,2 +0,0 @@
|
||||
ab51496e515622f560f4b989ed1d7e63 tensorrt_llm_internal_cutlass_kernels_static.lib
|
||||
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
|
||||
@@ -213,8 +213,9 @@ __global__ void batchApplyPenalty(T const* const* inputLogits, T* outputLogits,
if (hasMinLength)
{
__syncthreads();
// Min length
if ((threadIdx.x == 0) && (currentStep - inputLen < minLength))
// If current generation length is too short, make sure EOS doesn't have high probability.
// This check is not needed when endId is already -1 as generation won't stop on EOS anyway.
if ((threadIdx.x == 0) && (currentStep - inputLen < minLength) && endIds[batchSlot] > -1)
{
outLogitsPtr[endIds[batchSlot]] = MASK_VAL;
}

@@ -79,7 +79,8 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr
if (tid < k && endIds != nullptr) // if returnAllSelectedToken, endIds would not be an input
{
auto const index = tmpTopKBufIndex + tid;
if (blockLane == 0 && tid == 0)
// endId=-1 means generation doesn't stop upon encountering a certain token.
if (blockLane == 0 && tid == 0 && endIds[batchSlot] > -1)
{
auto const endId = endIds[batchSlot];
topKTmpIdBuf[index] = tmpLogBufIndex + endId;
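Both hunks add the same guard: the EOS logit is masked (or the top-k candidate redirected) only when `endId` names a real token, since `endId == -1` means generation never stops on an end token. A host-side sketch of the min-length rule, under the assumption that logits are a plain float array and `MASK_VAL` stands for a very negative constant:

```cpp
#include <cstddef>
#include <limits>
#include <vector>

// Illustrative host-side version of the min-length rule: mask the EOS logit only
// while the generated length is below minLength, and only when endId is a real
// token id (endId == -1 disables stopping on EOS entirely).
void maskEosBelowMinLength(std::vector<float>& logits, int generatedLen, int minLength, int endId)
{
    constexpr float kMaskVal = -std::numeric_limits<float>::infinity();
    if (generatedLen < minLength && endId > -1)
    {
        logits[static_cast<std::size_t>(endId)] = kMaskVal;
    }
}
```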
@ -252,29 +252,49 @@ __device__ __forceinline__ void MULTIMEM_ST2(ValType& val, PtrType ptr)
|
||||
asm volatile("multimem.st.global.v2.f32 [%0], {%1,%2};" ::"l"(ptr), "r"(val.x), "r"(val.y) : "memory");
|
||||
}
|
||||
|
||||
template <typename DType, typename ValType, typename PtrType>
|
||||
template <typename DType, bool const DISABLE_FP32_ACC, typename ValType, typename PtrType>
|
||||
__device__ __forceinline__ void MULTIMEM_LD(ValType& val, PtrType ptr)
|
||||
{
|
||||
if constexpr (std::is_same_v<DType, half>)
|
||||
{
|
||||
asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
if (!DISABLE_FP32_ACC)
|
||||
{
|
||||
asm("multimem.ld_reduce.global.add.v4.f16x2.acc::f32 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_BF16
|
||||
if constexpr (std::is_same_v<DType, __nv_bfloat16>)
|
||||
{
|
||||
asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
if (!DISABLE_FP32_ACC)
|
||||
{
|
||||
asm("multimem.ld_reduce.global.add.v4.bf16x2.acc::f32 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
|
||||
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||
: "l"(ptr)
|
||||
: "memory");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// All MC kernels here
|
||||
template <typename DType, int RANKS>
|
||||
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
|
||||
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
|
||||
void** commbuff, int const handleridx, float4* mc_ptr)
|
||||
@ -310,7 +330,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
uint4 val[UNROLL_MC];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_MC; i++)
|
||||
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_MC; i++)
|
||||
MULTIMEM_ST(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
@ -318,7 +338,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
for (int line = end_aligned; line < end_elem; line += loop_step0)
|
||||
{
|
||||
uint4 val;
|
||||
MULTIMEM_LD<DType>(val, mc_ptr + (lineoffset + line));
|
||||
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val, mc_ptr + (lineoffset + line));
|
||||
MULTIMEM_ST(val, mc_ptr + (lineoffset + line));
|
||||
}
|
||||
__syncthreads();
|
||||
@ -336,7 +356,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
} // fp16 inplace reduce kernel (Hopper) MC
|
||||
|
||||
#else
|
||||
template <typename DType, int RANKS>
|
||||
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
|
||||
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
|
||||
void** commbuff, int const handleridx, float4* mc_ptr)
|
||||
@ -382,8 +402,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
reinterpret_cast<void*>(&arg3), reinterpret_cast<void*>(&arg4), reinterpret_cast<void*>(&arg5), \
|
||||
reinterpret_cast<void*>(&arg6), reinterpret_cast<void*>(&arg7), reinterpret_cast<void*>(&arg8), \
|
||||
reinterpret_cast<void*>(&arg9), reinterpret_cast<void*>(&arg10)}; \
|
||||
TLLM_CUDA_CHECK( \
|
||||
cudaLaunchKernelExC(&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc<DType, x>), kernelArgs)); \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
|
||||
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
|
||||
}
|
||||
|
||||
struct LaunchConfig
|
||||
@ -529,7 +549,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4_mc(PackedVec<Type>& vec, float SFScaleV
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename DType, int UNROLL_NLINES>
|
||||
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS)
|
||||
userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4(int const op, int const flagoffset, int const firstrank,
|
||||
int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff,
|
||||
@ -574,7 +594,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
|
||||
DType* x = reinterpret_cast<DType*>(&val[0]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_NLINES; i++)
|
||||
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
|
||||
if (residual_in != nullptr)
|
||||
{
|
||||
@ -643,7 +663,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename DType, int UNROLL_NLINES>
|
||||
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS)
|
||||
userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot(int const op, int const flagoffset,
|
||||
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
|
||||
@ -687,7 +707,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
|
||||
DType* x = reinterpret_cast<DType*>(&val[0]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_NLINES; i++)
|
||||
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
|
||||
if (residual_in != nullptr)
|
||||
{
|
||||
@ -744,7 +764,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
|
||||
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
|
||||
template <typename DType, int UNROLL_NLINES>
|
||||
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op,
|
||||
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
|
||||
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
|
||||
@ -786,7 +806,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
DType* x = reinterpret_cast<DType*>(&val[0]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_NLINES; i++)
|
||||
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
|
||||
if (residual_in != nullptr)
|
||||
{
|
||||
@ -848,7 +868,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
*reduceidptr = reduce_id;
|
||||
} // quant kernel fp16->fp8 twoshot
|
||||
|
||||
template <typename DType, int UNROLL_NLINES>
|
||||
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op,
|
||||
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
|
||||
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
|
||||
@ -889,7 +909,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
DType* x = reinterpret_cast<DType*>(&val[0]);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < UNROLL_NLINES; i++)
|
||||
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
|
||||
|
||||
if (residual_in != nullptr)
|
||||
{
|
||||
@ -997,7 +1017,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
|
||||
} // residual allgather kernel
|
||||
|
||||
#else
|
||||
template <typename DType, int UNROLL_NLINES>
|
||||
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op,
|
||||
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
|
||||
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
|
||||
@ -1018,7 +1038,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
|
||||
asm volatile("brkpt;\n");
|
||||
}
|
||||
|
||||
template <typename DType, int UNROLL_NLINES>
|
||||
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
|
||||
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op,
|
||||
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
|
||||
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
|
||||
@ -1059,8 +1079,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
reinterpret_cast<void*>(&arg12), reinterpret_cast<void*>(&arg13), reinterpret_cast<void*>(&arg14), \
|
||||
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
|
||||
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20)}; \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
|
||||
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant<DType, x>), kernelArgs)); \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
|
||||
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
|
||||
}
|
||||
|
||||
#define callranksMC_RMSNORM_QUANT_ONESHOT(x) \
|
||||
@ -1091,8 +1111,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
reinterpret_cast<void*>(&arg12), reinterpret_cast<void*>(&arg13), reinterpret_cast<void*>(&arg14), \
|
||||
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
|
||||
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20)}; \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
|
||||
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot<DType, x>), kernelArgs)); \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
|
||||
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot<DType, x, DISABLE_FP32_ACC>), \
|
||||
kernelArgs)); \
|
||||
}
|
||||
|
||||
#define callranksMC_RMSNORM_QUANT_FP4(x) \
|
||||
@ -1127,8 +1148,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
|
||||
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20), \
|
||||
reinterpret_cast<void*>(&arg21), reinterpret_cast<void*>(&arg22), reinterpret_cast<void*>(&arg23)}; \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
|
||||
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4<DType, x>), kernelArgs)); \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
|
||||
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
|
||||
}
|
||||
|
||||
#define callranksMC_RMSNORM_QUANT_FP4_ONESHOT(x) \
|
||||
@ -1163,8 +1184,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
|
||||
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20), \
|
||||
reinterpret_cast<void*>(&arg21), reinterpret_cast<void*>(&arg22)}; \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
|
||||
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot<DType, x>), kernelArgs)); \
|
||||
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
|
||||
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot<DType, x, DISABLE_FP32_ACC>), \
|
||||
kernelArgs)); \
|
||||
}
|
||||
#define callranksMC_RES_AG(x) \
|
||||
if (nlines == x) \
|
||||
@ -1189,7 +1211,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
|
||||
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_res_allgather<DType, x>), kernelArgs)); \
|
||||
}
|
||||
|
||||
template <typename DType>
|
||||
template <typename DType, bool DISABLE_FP32_ACC>
|
||||
int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size_t const offset, size_t const elements,
|
||||
int const blocksize, communicator* comm, cudaStream_t stream, int op)
|
||||
{
|
||||
@ -1226,7 +1248,7 @@ int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size
|
||||
return sms;
|
||||
}
|
||||
|
||||
template <typename DType>
|
||||
template <typename DType, bool DISABLE_FP32_ACC>
|
||||
void allreduce_nonsharp_inplace(
|
||||
int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream, int op)
|
||||
{
|
||||
@ -1234,14 +1256,25 @@ void allreduce_nonsharp_inplace(
|
||||
return;
|
||||
int blocksize = elements * 2;
|
||||
int maxcredit = 0;
|
||||
int sms = allreduce2_userbuff_inplace_gpu<DType>(maxcredit, handler, offset, elements, blocksize, comm, stream, op);
|
||||
int sms;
|
||||
if (DISABLE_FP32_ACC)
|
||||
{
|
||||
sms = allreduce2_userbuff_inplace_gpu<DType, true>(
|
||||
maxcredit, handler, offset, elements, blocksize, comm, stream, op);
|
||||
}
|
||||
else
|
||||
{
|
||||
sms = allreduce2_userbuff_inplace_gpu<DType, false>(
|
||||
maxcredit, handler, offset, elements, blocksize, comm, stream, op);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DType>
|
||||
template <typename DType, bool DISABLE_FP32_ACC>
|
||||
void allreduce2_userbuff_inplace(
|
||||
int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream)
|
||||
{
|
||||
allreduce_nonsharp_inplace<DType>(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2);
|
||||
allreduce_nonsharp_inplace<DType, DISABLE_FP32_ACC>(
|
||||
handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2);
|
||||
}
|
||||
|
||||
bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size)
|
||||
@ -1262,7 +1295,7 @@ bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size)
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DType>
|
||||
template <typename DType, bool DISABLE_FP32_ACC>
|
||||
int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const offset, int const out_handler,
|
||||
size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps,
|
||||
float* scalefactor, void* residual_in, void* residual_out, communicator* comm, cudaStream_t stream)
|
||||
@ -1315,7 +1348,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const of
|
||||
return sms;
|
||||
}
|
||||
|
||||
template <typename DType>
|
||||
template <typename DType, bool DISABLE_FP32_ACC>
|
||||
int allreduce2_userbuff_inplace_rmsnorm_quant_fp4(int const handler, size_t const offset, int const out_handler,
|
||||
size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements,
|
||||
int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in,
|
||||
@ -1422,11 +1455,31 @@ void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, si
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case nvinfer1::DataType::kHALF: allreduce2_userbuff_inplace<half>(handler, offset, elements, comm, stream); break;
|
||||
case nvinfer1::DataType::kHALF:
|
||||
{
|
||||
if (kDISABLE_FP32_ACCUMULATION)
|
||||
{
|
||||
allreduce2_userbuff_inplace<half, true>(handler, offset, elements, comm, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
allreduce2_userbuff_inplace<half, false>(handler, offset, elements, comm, stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#ifdef ENABLE_BF16
|
||||
case nvinfer1::DataType::kBF16:
|
||||
allreduce2_userbuff_inplace<__nv_bfloat16>(handler, offset, elements, comm, stream);
|
||||
{
|
||||
if (kDISABLE_FP32_ACCUMULATION)
|
||||
{
|
||||
allreduce2_userbuff_inplace<__nv_bfloat16, true>(handler, offset, elements, comm, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
allreduce2_userbuff_inplace<__nv_bfloat16, false>(handler, offset, elements, comm, stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_impl");
|
||||
}
|
||||
@ -1458,14 +1511,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_impl(int const handler, size_t con
|
||||
switch (dataType)
|
||||
{
|
||||
case nvinfer1::DataType::kHALF:
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant<half>(handler, offset, out_handler, out_offset, elements,
|
||||
hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
|
||||
{
|
||||
if (kDISABLE_FP32_ACCUMULATION)
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant<half, true>(handler, offset, out_handler, out_offset,
|
||||
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant<half, false>(handler, offset, out_handler, out_offset,
|
||||
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#ifdef ENABLE_BF16
|
||||
case nvinfer1::DataType::kBF16:
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16>(handler, offset, out_handler, out_offset,
|
||||
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
|
||||
{
|
||||
if (kDISABLE_FP32_ACCUMULATION)
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, true>(handler, offset, out_handler,
|
||||
out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm,
|
||||
stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, false>(handler, offset, out_handler,
|
||||
out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm,
|
||||
stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl");
|
||||
}
|
||||
@ -1479,16 +1554,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t
|
||||
switch (dataType)
|
||||
{
|
||||
case nvinfer1::DataType::kHALF:
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half>(handler, offset, out_handler, out_offset,
|
||||
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
|
||||
residual_out, comm, stream);
|
||||
if (kDISABLE_FP32_ACCUMULATION)
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half, true>(handler, offset, out_handler, out_offset,
|
||||
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
|
||||
residual_out, comm, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half, false>(handler, offset, out_handler, out_offset,
|
||||
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
|
||||
residual_out, comm, stream);
|
||||
}
|
||||
break;
|
||||
#ifdef ENABLE_BF16
|
||||
case nvinfer1::DataType::kBF16:
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16>(handler, offset, out_handler, out_offset,
|
||||
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
|
||||
residual_out, comm, stream);
|
||||
{
|
||||
if (kDISABLE_FP32_ACCUMULATION)
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, true>(handler, offset, out_handler,
|
||||
out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor,
|
||||
residual_in, residual_out, comm, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, false>(handler, offset, out_handler,
|
||||
out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor,
|
||||
residual_in, residual_out, comm, stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl");
|
||||
}
|
||||
|
||||
@@ -656,4 +656,7 @@ __inline__ __device__ T blockReduceSumV2(T* val)
warpReduceSumV2<T, NUM>(val);
return (T) 0.0f;
}

static bool const kDISABLE_FP32_ACCUMULATION = getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr;

} // namespace tensorrt_llm::runtime::ub

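The surrounding hunks thread this environment flag into the user-buffer all-reduce kernels as a `bool` template parameter, so the FP32-accumulation choice is fixed at compile time and the flag is branched on only once at each dispatch site. A generic sketch of that pattern; `runKernelImpl` and its arguments are illustrative, not TensorRT-LLM symbols:

```cpp
#include <cstdlib>

// Compile-time specialization: each instantiation bakes in one accumulation mode.
template <bool DISABLE_FP32_ACC>
void runKernelImpl(/* kernel arguments */)
{
    if constexpr (DISABLE_FP32_ACC)
    {
        // launch the ld_reduce variant without the .acc::f32 qualifier
    }
    else
    {
        // launch the FP32-accumulating variant
    }
}

// Runtime entry point: read the flag once, then branch to the matching instantiation.
void runKernel(/* kernel arguments */)
{
    static bool const kDisableFp32Acc = std::getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr;
    if (kDisableFp32Acc)
    {
        runKernelImpl<true>();
    }
    else
    {
        runKernelImpl<false>();
    }
}
```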
@@ -96,18 +96,7 @@ inline bool is_supported(int arch, KernelType kernel_type)
SUPPORT(KernelType::FP16Int8PerChannel);
SUPPORT(KernelType::FP16Int4PerChannel);
}
else if (arch >= 80 && arch < 90)
{
SUPPORT(KernelType::FP16Int8Groupwise);
SUPPORT(KernelType::BF16Int8Groupwise);
SUPPORT(KernelType::FP16Int4Groupwise);
SUPPORT(KernelType::BF16Int4Groupwise);
SUPPORT(KernelType::FP16Int8PerChannel);
SUPPORT(KernelType::BF16Int8PerChannel);
SUPPORT(KernelType::FP16Int4PerChannel);
SUPPORT(KernelType::BF16Int4PerChannel);
}
else if (arch >= 90 && arch != 120)
else if (arch >= 80)
{
SUPPORT(KernelType::FP16Int8Groupwise);
SUPPORT(KernelType::BF16Int8Groupwise);

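For the architecture gate itself, the visible effect of this hunk is that the separate `arch >= 80 && arch < 90` and `arch >= 90 && arch != 120` branches collapse into a single `arch >= 80` branch, so SM120 is no longer excluded. A one-line illustration (the function name is hypothetical):

```cpp
// Hypothetical helper mirroring the consolidated condition; previously the union of
// the two branches was effectively (arch >= 80 && arch != 120).
bool usesConsolidatedSm80Branch(int arch)
{
    return arch >= 80;
}
```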
@@ -64,7 +64,7 @@ void initBindings(pybind11::module_& m)

py::classh<GenLlmReq>(m, "GenericLlmRequest")
.def("validate", &GenLlmReq::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
py::arg("max_draft_len"), py::arg("max_endocer_input_len") = std::nullopt,
py::arg("max_draft_len"), py::arg("vocab_size_padded"), py::arg("max_endocer_input_len") = std::nullopt,
py::arg("enable_kv_cache_reuse") = false)
.def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, py::arg("exclude"))
.def("get_num_tokens", &GenLlmReq::getNumTokens, py::arg("beam"))

@@ -96,7 +96,10 @@ def build_engines(model_cache: typing.Optional[str] = None,
world_size: int = 1,
clean: Optional[bool] = False):

for model_name in ["chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b"]:
for model_name in [
"chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b", "glm-4-9b",
"chatglm3-6b-32k"
]:
is_chatglm_6b_or_glm_10b = model_name in ["chatglm-6b", "glm-10b"]
if model_cache and (Path(model_cache) / model_name).is_dir():
model_cache_dir = Path(model_cache) / model_name

@@ -136,20 +139,12 @@ def build_engines(model_cache: typing.Optional[str] = None,
if ckpt_dir.is_dir():
shutil.rmtree(ckpt_dir, ignore_errors=True)

# Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B, hope to remove this in the future
if model_name == "chatglm-6b":
# Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B / ChatGLM3-6B-32K, hope to remove this in the future
if model_name in [
"chatglm-6b", "glm-4-9b", "chatglm2-6b", "chatglm3-6b-32k"
]:
shutil.copy(
chatglm_example_dir / "chatglm-6b/tokenization_chatglm.py",
hf_dir,
)
if model_name == "glm-4-9b":
shutil.copy(
chatglm_example_dir / "glm-4-9b/tokenization_chatglm.py",
hf_dir,
)
if model_name == "chatglm2-6b":
shutil.copy(
chatglm_example_dir / "chatglm2-6b/tokenization_chatglm.py",
chatglm_example_dir / f"{model_name}/tokenization_chatglm.py",
hf_dir,
)

@@ -530,12 +530,6 @@ def prepare_multi_gpu_model_tests(python_exe: str,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)

prepare_model_tests(model_name="chatglm",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)


def prepare_model_tests(model_name: str,
python_exe: str,

@@ -1,9 +1,9 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=25.01-py3
ARG BASE_TAG=25.03-py3
ARG DEVEL_IMAGE=devel

FROM ${BASE_IMAGE}:${BASE_TAG} as base
FROM ${BASE_IMAGE}:${BASE_TAG} AS base

# https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html
# The default values come from `nvcr.io/nvidia/pytorch`

@@ -14,7 +14,10 @@ ENV GITHUB_MIRROR=$GITHUB_MIRROR
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]

FROM base as devel
# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true

FROM base AS devel

ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"

@@ -59,7 +62,7 @@ ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh

FROM ${DEVEL_IMAGE} as wheel
FROM ${DEVEL_IMAGE} AS wheel
WORKDIR /src/tensorrt_llm
COPY benchmarks benchmarks
COPY cpp cpp

@@ -77,7 +80,7 @@ ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings -
RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \
python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}

FROM ${DEVEL_IMAGE} as release
FROM ${DEVEL_IMAGE} AS release

# Create a cache directory for pip
RUN mkdir -p /root/.cache/pip

@@ -152,16 +152,16 @@ jenkins-aarch64_%: STAGE = devel
jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
jenkins-rockylinux8_%: STAGE = devel
jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
jenkins-rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8
jenkins-rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8

rockylinux8_%: STAGE = devel
rockylinux8_%: BASE_IMAGE = nvidia/cuda
rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8
rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8

# For x86_64 and aarch64
ubuntu22_%: STAGE = devel
ubuntu22_%: BASE_IMAGE = nvidia/cuda
ubuntu22_%: BASE_TAG = 12.6.3-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 12.8.1-devel-ubuntu22.04

trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0

@@ -5,7 +5,7 @@ set -ex
# This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
# Only use when public CUDA image is not ready.
CUDA_VER="12.8.0_570.86.10"
CUDA_VER="12.8.1_570.124.06"
CUDA_VER_SHORT="${CUDA_VER%_*}"

NVCC_VERSION_OUTPUT=$(nvcc --version)

@@ -4,9 +4,8 @@ set -ex

# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01
# PyTorch v2.5.1 has a fix for https://github.com/pytorch/pytorch/issues/138324.
TORCH_VERSION="2.5.1"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03
TORCH_VERSION="2.6.0"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')

prepare_environment() {

@@ -2,20 +2,20 @@

set -ex

TRT_VER="10.8.0.43"
TRT_VER="10.9.0.34"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01
CUDA_VER="12.8" # 12.8.0
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03
CUDA_VER="12.8" # 12.8.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.7.0.66-1"
CUDNN_VER="9.8.0.87-1"
NCCL_VER="2.25.1-1+cuda12.8"
CUBLAS_VER="12.8.3.14-1"
CUBLAS_VER="12.8.4.1-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.8.61-1"
CUDA_RUNTIME="12.8.57-1"
CUDA_DRIVER_VERSION="570.86.10-1.el8"
NVRTC_VER="12.8.93-1"
CUDA_RUNTIME="12.8.90-1"
CUDA_DRIVER_VERSION="570.124.06-1.el8"

for i in "$@"; do
case $i in

@@ -116,7 +116,7 @@ install_tensorrt() {
if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

@@ -1,7 +1,7 @@
sphinx>=7.0
sphinx-argparse
sphinx-click
sphinx-rtd-theme
nvidia-sphinx-theme
myst_parser
breathe
pygit2

@@ -67,7 +67,7 @@ source_suffix = {
'.md': 'markdown',
}

html_theme = 'sphinx_rtd_theme'
html_theme = 'nvidia_sphinx_theme'
html_static_path = ['_static']

# ------------------------ C++ Doc related --------------------------

@@ -27,8 +27,6 @@ Welcome to TensorRT-LLM's Documentation!

installation/linux.md
installation/build-from-source-linux.md
installation/windows.md
installation/build-from-source-windows.md
installation/grace-hopper.md

@ -1,199 +0,0 @@
|
||||
(build-from-source-windows)=
|
||||
|
||||
# Building from Source Code on Windows
|
||||
|
||||
```{note}
|
||||
This section is for advanced users. Skip this section if you plan to use the pre-built TensorRT-LLM release wheel.
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Install prerequisites listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document.
|
||||
2. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path.
|
||||
3. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/).
|
||||
4. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip).
|
||||
|
||||
## Building a TensorRT-LLM Docker Image
|
||||
|
||||
### Docker Desktop
|
||||
|
||||
1. Install [Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/).
|
||||
2. Set the following configurations:
|
||||
|
||||
1. Right-click the Docker icon in the Windows system tray (bottom right of your taskbar) and select **Switch to Windows containers...**.
|
||||
2. In the Docker Desktop settings on the **General** tab, uncheck **Use the WSL 2 based image**.
|
||||
3. On the **Docker Engine** tab, set your configuration file to:
|
||||
|
||||
```
|
||||
{
|
||||
"experimental": true
|
||||
}
|
||||
```
|
||||
|
||||
```{note}
|
||||
After building, copy the files out of your container. `docker cp` is not supported on Windows for Hyper-V based images. Unless you are using WSL 2 based images, mount a folder, for example, `trt-llm-build`, to your container when you run it for moving files between the container and host system.
|
||||
```
|
||||
|
||||
### Acquire an Image
|
||||
|
||||
The Docker container will be hosted for public download in a future release. At this time, it must be built manually. From the `TensorRT-LLM\windows\` folder, run the build command:
|
||||
|
||||
```bash
|
||||
docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest .
|
||||
```
|
||||
|
||||
And your image is now ready for use.
|
||||
|
||||
### Run the Container
|
||||
|
||||
Run the container in interactive mode with your build folder mounted. Specify a memory limit with the `-m` flag. By default, the limit is 2 GB, which is not sufficient to build TensorRT-LLM.
|
||||
|
||||
```bash
|
||||
docker run -it -m 12g -v .\trt-llm-build:C:\workspace\trt-llm-build tensorrt-llm-windows-build:latest
|
||||
```
|
||||
|
||||
### Build and Extract Files
|
||||
|
||||
1. Clone and setup the TensorRT-LLM repository within the container.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/TensorRT-LLM.git
|
||||
cd TensorRT-LLM
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
2. Build TensorRT-LLM. This command generates `build\tensorrt_llm-*.whl`.
|
||||
|
||||
```bash
|
||||
python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-10.8.0.43\
|
||||
```
|
||||
|
||||
3. Copy or move `build\tensorrt_llm-*.whl` into your mounted folder so it can be accessed on your host machine. If you intend to use the C++ runtime, you'll also need to gather various DLLs from the build into your mounted folder. For more information, refer to [C++ Runtime Usage](#c-runtime-usage).
|
||||
|
||||
|
||||
|
||||
## Building TensorRT-LLM on Bare Metal
|
||||
|
||||
**Prerequisites**
|
||||
|
||||
1. Install all prerequisites (`git`, `python`, `CUDA`) listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document.
|
||||
2. Install Nsight NVTX. TensorRT-LLM on Windows currently depends on NVTX assets that do not come packaged with the CUDA 12.8.0 installer. To install these assets, download the [CUDA 11.8 Toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows&target_arch=x86_64).
|
||||
|
||||
1. During installation, select **Advanced installation**.
|
||||
|
||||
2. Nsight NVTX is located in the CUDA drop-down.
|
||||
|
||||
3. Deselect all packages, and select **Nsight NVTX**.
|
||||
|
||||
3. Install the dependencies one of two ways:
|
||||
|
||||
1. Run the `setup_build_env.ps1` script, which installs CMake, Microsoft Visual Studio Build Tools, and TensorRT automatically with default settings.
|
||||
|
||||
1. Run PowerShell as Administrator to use the script.
|
||||
|
||||
```bash
|
||||
./setup_build_env.ps1 -TRTPath <TRT-containing-folder> [-skipCMake] [-skipVSBuildTools] [-skipTRT]
|
||||
```
|
||||
|
||||
2. Close and reopen PowerShell after running the script so that `Path` changes take effect.
|
||||
|
||||
3. Supply a directory that already exists to contain TensorRT to `-TRTPath`, for example, `-TRTPath ~/inference` may be valid, but `-TRTPath ~/inference/TensorRT` will not be valid if `TensorRT` does not exist. `-TRTPath` isn't required if `-skipTRT` is supplied.
|
||||
|
||||
2. Install the dependencies one at a time.

   1. Install [CMake](https://cmake.org/download/) (version 3.27.7 is recommended) and select the option to add it to the system path.

   2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). When prompted to select more Workloads, check **Desktop development with C++**.

   3. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`.

      1. Add the TensorRT libraries to your system's `Path` environment variable. Your `Path` should include a line like this:

         ```bash
         %USERPROFILE%\inference\TensorRT\lib
         ```

      2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.

      3. Remove existing `tensorrt` wheels first by executing

         ```bash
         pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings
         pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12
         ```

      4. Install the TensorRT core libraries: run PowerShell and use `pip` to install the Python wheel.

         ```bash
         pip install %USERPROFILE%\inference\TensorRT\python\tensorrt-*.whl
         ```

      5. Verify that your TensorRT installation is working properly.

         ```bash
         python -c "import tensorrt as trt; print(trt.__version__)"
         ```

**Steps**

1. Launch a 64-bit Developer PowerShell. From your usual PowerShell terminal, run one of the following two commands.

   1. If you installed Visual Studio Build Tools (that is, used the `setup_build_env.ps1` script):

      ```bash
      & 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64
      ```

   2. If you installed Visual Studio Community (for example, via the manual GUI setup):

      ```bash
      & 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64
      ```

2. In PowerShell, from the `TensorRT-LLM` root folder, run:

   ```bash
   python .\scripts\build_wheel.py -a "89-real" --trt_root <path_to_trt_root>
   ```

   The `-a` flag specifies the device architecture. `"89-real"` supports GeForce 40-series cards.

   The flag `-D "ENABLE_MULTI_DEVICE=0"`, while not specified here, is implied on Windows. Multi-device inference is supported on Linux, but not on Windows.

   This command generates `build\tensorrt_llm-*.whl`.

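As a quick optional check that the build produced the wheel, a minimal sketch is shown below; the relative path assumes your current working directory is still the `TensorRT-LLM` root folder.

```python
# Optional check: confirm build_wheel.py produced the expected wheel.
# Assumes the current working directory is the TensorRT-LLM root folder.
import glob

wheels = glob.glob(r"build\tensorrt_llm-*.whl")
if wheels:
    print("Built wheel:", wheels[0])
else:
    raise SystemExit("No wheel found under build\\ - check the build output for errors.")
```
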
(c-runtime-usage)=

## Linking with the TensorRT-LLM C++ Runtime

```{note}
This section is for advanced users. Skip this section if you do not intend to use the TensorRT-LLM C++ runtime directly. You must build from source to use the C++ runtime.
```

Building from source creates libraries that can be used if you wish to directly link against the C++ runtime for TensorRT-LLM. These libraries are also required if you wish to run C++ unit tests and some benchmarks.

Building from source produces the following library files.

- `tensorrt_llm` libraries located in `cpp\build\tensorrt_llm`
  - `tensorrt_llm.dll` - Shared library
  - `tensorrt_llm.exp` - Export file
  - `tensorrt_llm.lib` - Stub for linking to `tensorrt_llm.dll`
- Dependency libraries (these get copied to `tensorrt_llm\libs\`)
  - `nvinfer_plugin_tensorrt_llm` libraries located in `cpp\build\tensorrt_llm\plugins\`
    - `nvinfer_plugin_tensorrt_llm.dll`
    - `nvinfer_plugin_tensorrt_llm.exp`
    - `nvinfer_plugin_tensorrt_llm.lib`
  - `th_common` libraries located in `cpp\build\tensorrt_llm\thop\`
    - `th_common.dll`
    - `th_common.exp`
    - `th_common.lib`

The locations of these DLLs, along with some `torch` and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:

```bash
%USERPROFILE%\inference\TensorRT\lib
%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib
```

Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder. A quick sanity check for the setup is sketched below.

Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.
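
The following is a minimal sketch (not part of the official tooling) that verifies the runtime DLLs named above can be located on the current `Path` and loaded; the DLL names follow the listing above and the results depend on your local layout.

```python
# Minimal sanity check: confirm the TensorRT-LLM C++ runtime DLLs can be found
# on the current Path and loaded. DLL names follow the library listing above;
# adjust as needed for your setup. Windows-only (uses ctypes.WinDLL).
import ctypes
import os
from pathlib import Path

dlls = ["tensorrt_llm.dll", "nvinfer_plugin_tensorrt_llm.dll", "th_common.dll"]
search_dirs = [Path(p) for p in os.environ.get("PATH", "").split(os.pathsep) if p]

for name in dlls:
    hits = [d / name for d in search_dirs if (d / name).is_file()]
    if not hits:
        print(f"NOT FOUND on Path: {name}")
        continue
    try:
        ctypes.WinDLL(str(hits[0]))  # raises OSError if a dependency is missing
        print(f"OK: {hits[0]}")
    except OSError as err:
        print(f"FOUND but failed to load {hits[0]}: {err}")
```
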
@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).

   ```bash
   pip3 install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
   pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

   sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm --extra-index-url https://pypi.nvidia.com
   ```

@ -1,81 +0,0 @@
(windows)=

# Installing on Windows

```{note}
The Windows release of TensorRT-LLM is currently in beta.
We recommend checking out the [v0.17.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.17.0) for the most stable experience.
```

```{note}
TensorRT-LLM on Windows only supports single-GPU execution.
```

**Prerequisites**

1. Clone this repository using [Git for Windows](https://git-scm.com/download/win).

2. Install the dependencies one of two ways:

   1. Install all dependencies together.

      1. Run the provided PowerShell script `setup_env.ps1` located under the `/windows/` folder, which installs Python and CUDA 12.8.0 automatically with default settings. Run PowerShell as Administrator to use the script.

         ```bash
         ./setup_env.ps1 [-skipCUDA] [-skipPython]
         ```

      2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path` modified by the `setup_env.ps1` script above.

   2. Install the dependencies one at a time.

      1. Install [Python 3.10](https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe).

         1. Select **Add python.exe to PATH** at the start of the installation. The installation may only add the `python` command, but not the `python3` command.
         2. Navigate to the installation path `%USERPROFILE%\AppData\Local\Programs\Python\Python310` (`AppData` is a hidden folder) and copy `python.exe` to `python3.exe`.

      2. Install the [CUDA 12.8.0 Toolkit](https://developer.nvidia.com/cuda-12-8-0-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart.

3. If you are using a conda environment, run the following command before installing TensorRT-LLM.

   ```bash
   conda install -c conda-forge pyarrow
   ```

**Steps**

1. Install TensorRT-LLM.

   If you have an existing TensorRT installation (from older versions of `tensorrt_llm`), please execute

   ```bash
   pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings
   pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12
   ```

   before installing TensorRT-LLM with the following command.

   ```bash
   pip install tensorrt_llm==0.17.0.post1 --extra-index-url https://download.pytorch.org/whl/ --extra-index-url https://pypi.nvidia.com
   ```

   Run the following command to verify that your TensorRT-LLM installation is working properly.

   ```bash
   python -c "import tensorrt_llm; print(tensorrt_llm._utils.trt_version())"
   ```

2. Build the model.
3. Deploy the model.

**Known Issues**

1. `OSError: exception: access violation reading 0x0000000000000000` during `import tensorrt_llm` or `trtllm-build`.

   This may be caused by an outdated Microsoft Visual C++ Redistributable version. Please install
   [the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version)
   and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (for example, the `pyarrow` package might contain an outdated MSVC DLL).

2. `OSError: [WinError 126] The specified module could not be found. Error loading "...\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.`

   Installing the latest [Build Tools for Visual Studio 2022](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue.

@ -35,7 +35,7 @@ Certain limitations might apply. Refer to the {ref}`support-matrix` for more inf

### Native Windows Support

Application developers and AI enthusiasts can now benefit from accelerated LLMs running locally on PCs and Workstations powered by NVIDIA RTX and NVIDIA GeForce RTX GPUs. Refer to the {ref}`windows` section for more information.
Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases.

## What Can You Do With TensorRT-LLM?

@ -90,7 +90,7 @@ In addition, older architectures can have limitations for newer software release
* -
  - Hardware Compatibility
* - Operating System
  - TensorRT-LLM requires Linux x86_64, Linux aarch64 or Windows.
  - TensorRT-LLM requires Linux x86_64 or Linux aarch64.
* - GPU Model Architectures
  -
    - [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)

@ -112,9 +112,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
  - Software Compatibility
* - Container
  - [25.01](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
  - [25.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
  - [10.8](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
  - [10.9](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
  -
    - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

@ -5,6 +5,23 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).


## TensorRT-LLM Release 0.18.0

### Key Features and Enhancements
- **Features that were previously available in the 0.18.0.dev pre-releases are not included in this release**.
- [BREAKING CHANGE] Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases.

### Known Issues
- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.

### Infrastructure Changes
- The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.03-py3`.
- The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.03-py3`.
- The dependent TensorRT version is updated to 10.9.
- The dependent CUDA version is updated to 12.8.1.
- The dependent NVIDIA ModelOpt version is updated to 0.25 for the Linux platform.


## TensorRT-LLM Release 0.17.0

### Key Features and Enhancements

@ -33,7 +33,7 @@ Or you can try the following commands to get a quantized model by yourself:

```bash
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
cd TensorRT-Model-Optimizer/llm_ptq
cd TensorRT-Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
```

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py
@ -0,0 +1,313 @@
import json
import os
from typing import Dict, List, Optional, Union

from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy


class SPTokenizer:

    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = [
            "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>",
            "<|user|>", "<|assistant|>", "<|observation|>"
        ]
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        text, buffer = "", []
        for token in t:
            if token in self.index_special_tokens:
                if buffer:
                    text += self.sp_model.decode(buffer)
                    buffer = []
                text += self.index_special_tokens[token]
            else:
                buffer.append(token)
        if buffer:
            text += self.sp_model.decode(buffer)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
            return ""
        return self.sp_model.IdToPiece(index)


class ChatGLMTokenizer(PreTrainedTokenizer):
    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self,
                 vocab_file,
                 padding_side="left",
                 clean_up_tokenization_spaces=False,
                 **kwargs):
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<pad>": self.tokenizer.pad_id
        }
        super().__init__(
            padding_side=padding_side,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs)

    def get_command(self, token):
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return "<unk>"

    @property
    def pad_token(self) -> str:
        return "<unk>"

    @property
    def pad_token_id(self):
        return self.get_command("<pad>")

    @property
    def eos_token(self) -> str:
        return "</s>"

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {
            self._convert_id_to_token(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the name of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory,
                                      self.vocab_files_names["vocab_file"])
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_bytes = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_bytes)

        return (vocab_file, )

    def get_prefix_tokens(self):
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message):
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")
                       ] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(
                    item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(
                self.build_single_message(item["role"],
                                          item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids],
                                      return_tensors="pt",
                                      is_split_into_words=True)

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [
                self.get_command("<eos>")
            ]
        return token_ids_0

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        padding_side: str = "left",  # wili, fix for new transformers
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (
                max_length % pad_to_multiple_of != 0):
            max_length = (
                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
            required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [
                    0
                ] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [
                    0
                ] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[
                0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
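
For context, the tokenizer added above is typically driven through `build_chat_input`. A minimal usage sketch follows; the local module import and the `tokenizer.model` path are assumptions and depend on where you place the file and the ChatGLM3-6B-32K checkpoint (requires `sentencepiece`, `transformers`, and `torch`).

```python
# Hypothetical usage of the ChatGLMTokenizer added above. The import assumes
# tokenization_chatglm.py is on sys.path; the vocab path is a placeholder.
from tokenization_chatglm import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer(vocab_file="./chatglm3-6b-32k/tokenizer.model")

history = [{"role": "system", "content": "You are a helpful assistant."}]
inputs = tokenizer.build_chat_input("What is TensorRT-LLM?",
                                    history=history,
                                    role="user")
print(inputs["input_ids"].shape)  # batched token IDs ready for the model
```
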
@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
protobuf

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

@ -2,7 +2,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

@ -1,5 +1,5 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.43.0
datasets==2.14.6
evaluate~=0.4.1

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.39.0
datasets~=2.14.5
evaluate

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

@ -1,3 +1,3 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers==4.38.2
accelerate==0.25.0

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
nemo-toolkit[all]==2.0.0rc1
megatron-core==0.8.0
datasets~=2.14.5

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets>=2.14.4
nemo-toolkit[all]==2.0.0rc1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,5 +1,5 @@
tensorrt_llm==0.17.0.post1
git+https://github.com/google-deepmind/recurrentgemma.git
tensorrt_llm==0.18.0
git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
flax>=0.8.2
jax~=0.4.23
orbax-checkpoint==0.5.7

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
tiktoken
datasets
kaldialign