From 3c0462002cbefde475d46acb5d5edefec92dfb6a Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Wed, 2 Apr 2025 17:01:16 +0800 Subject: [PATCH] TensorRT-LLM v0.18 release (#3231) --- .pre-commit-config.yaml | 4 +- README.md | 6 +- .../tensorrt_llm/batch_manager/GptManager.h | 1 + .../batch_manager/allocateKvCache.h | 2 - .../batch_manager/evictionPolicy.h | 2 +- .../batch_manager/kvCacheManager.h | 38 ++- .../tensorrt_llm/batch_manager/kvCacheUtils.h | 2 +- .../tensorrt_llm/batch_manager/llmRequest.h | 7 +- .../tensorrt_llm/runtime/samplingConfig.h | 1 + .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../libtensorrt_llm_ucx_wrapper.so | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../libtensorrt_llm_ucx_wrapper.so | 2 +- .../x86_64-linux-gnu/version.txt | 4 +- .../tensorrt_llm_batch_manager_static.lib | 3 - .../x86_64-windows-msvc/version.txt | 2 - .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/x86_64-linux-gnu/version.txt | 6 +- .../tensorrt_llm_executor_static.lib | 3 - .../executor/x86_64-windows-msvc/version.txt | 2 - .../libtensorrt_llm_nvrtc_wrapper.so | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- .../libtensorrt_llm_nvrtc_wrapper.so | 4 +- .../nvrtcWrapper/x86_64-linux-gnu/version.txt | 4 +- .../tensorrt_llm_nvrtc_wrapper.dll | 3 - .../tensorrt_llm_nvrtc_wrapper.lib | 3 - .../x86_64-windows-msvc/version.txt | 3 - ...orrt_llm_internal_cutlass_kernels_static.a | 4 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 4 +- .../aarch64-linux-gnu/version.txt | 6 +- ...orrt_llm_internal_cutlass_kernels_static.a | 4 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 4 +- .../x86_64-linux-gnu/version.txt | 6 +- ...rt_llm_internal_cutlass_kernels_static.lib | 3 - .../x86_64-windows-msvc/version.txt | 2 - cpp/tensorrt_llm/kernels/penaltyKernels.cu | 5 +- .../kernels/samplingTopKKernels.cu | 3 +- .../kernels/userbuffers/userbuffers.cu | 199 ++++++++--- cpp/tensorrt_llm/kernels/userbuffers/utils.h | 3 + .../weightOnlyBatchedGemv/kernelLauncher.h | 13 +- .../pybind/batch_manager/bindings.cpp | 2 +- .../scripts/build_chatglm_engines.py | 23 +- cpp/tests/resources/scripts/test_cpp.py | 6 - docker/Dockerfile.multi | 13 +- docker/Makefile | 6 +- docker/common/install_cuda_toolkit.sh | 2 +- docker/common/install_pytorch.sh | 5 +- docker/common/install_tensorrt.sh | 18 +- docs/requirements.txt | 2 +- docs/source/conf.py | 2 +- docs/source/index.rst | 2 - .../installation/build-from-source-windows.md | 199 ----------- docs/source/installation/grace-hopper.md | 2 +- docs/source/installation/windows.md | 81 ----- docs/source/overview.md | 2 +- docs/source/reference/support-matrix.md | 6 +- docs/source/release-notes.md | 17 + docs/source/torch.md | 2 +- examples/baichuan/requirements.txt | 2 +- examples/bloom/requirements.txt | 2 +- .../chatglm3-6b-32k/tokenization_chatglm.py | 313 ++++++++++++++++++ examples/chatglm/requirements.txt | 2 +- examples/commandr/requirements.txt | 2 +- examples/dbrx/requirements.txt | 2 +- examples/deepseek_v1/requirements.txt | 2 +- examples/draft_target_model/requirements.txt | 2 +- examples/eagle/requirements.txt | 2 +- examples/falcon/requirements.txt | 2 +- 
examples/gemma/requirements.txt | 2 +- examples/gpt/requirements.txt | 2 +- examples/gptj/requirements.txt | 2 +- examples/gptneox/requirements.txt | 2 +- examples/grok/requirements.txt | 2 +- examples/internlm/requirements.txt | 2 +- examples/jais/requirements.txt | 2 +- examples/llama/requirements.txt | 2 +- examples/lookahead/requirements.txt | 2 +- examples/mamba/requirements.txt | 2 +- examples/medusa/requirements.txt | 2 +- examples/mixtral/requirements.txt | 2 +- examples/mpt/requirements.txt | 2 +- examples/nemotron/requirements.txt | 2 +- examples/opt/requirements.txt | 2 +- examples/phi/requirements.txt | 2 +- examples/prompt_lookup/requirements.txt | 2 +- examples/quantization/requirements.txt | 2 +- examples/qwen/requirements.txt | 2 +- examples/qwenvl/requirements.txt | 2 +- examples/recurrentgemma/requirements.txt | 4 +- examples/redrafter/requirements.txt | 2 +- examples/skywork/requirements.txt | 2 +- examples/smaug/requirements.txt | 2 +- examples/whisper/requirements.txt | 2 +- requirements.txt | 12 +- .../_torch/attention_backend/flashinfer.py | 246 +++++++------- .../_torch/attention_backend/interface.py | 61 ++-- .../attention_backend/star_flashinfer.py | 5 +- .../_torch/attention_backend/trtllm.py | 4 +- .../_torch/attention_backend/vanilla.py | 6 +- .../models/modeling_multimodal_utils.py | 149 +++++++++ tensorrt_llm/_torch/models/modeling_vit.py | 3 +- tensorrt_llm/llmapi/_perf_evaluator.py | 4 +- .../runtime/multimodal_model_runner.py | 27 +- tensorrt_llm/version.py | 2 +- tests/_torch/test_fp4_gemm_quantize.py | 2 + tests/llmapi/test_llm_models.py | 2 +- 113 files changed, 1001 insertions(+), 707 deletions(-) delete mode 100644 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib delete mode 100644 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt delete mode 100644 cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib delete mode 100644 cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt delete mode 100644 docs/source/installation/build-from-source-windows.md delete mode 100644 docs/source/installation/windows.md create mode 100644 examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py create mode 100644 tensorrt_llm/_torch/models/modeling_multimodal_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6c65a0b401..4224abebba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: isort - repo: https://github.com/Lucas-C/pre-commit-hooks.git - rev: v1.1.13 + rev: v1.5.5 hooks: - id: remove-crlf - repo: https://github.com/google/yapf @@ -24,7 +24,7 @@ repos: - id: check-yaml - id: trailing-whitespace - repo: https://github.com/PyCQA/autoflake - rev: v1.6.1 + rev: v2.3.1 hooks: - id: autoflake args: ['--in-place', 
'--remove-all-unused-imports', '--remove-unused-variables'] diff --git a/README.md b/README.md index dc17c77dff..da4975cf1c 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@ TensorRT-LLM [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/) [![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/) [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) -[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads) -[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.17.0-green)](./tensorrt_llm/version.py) +[![cuda](https://img.shields.io/badge/cuda-12.8.1-green)](https://developer.nvidia.com/cuda-downloads) +[![trt](https://img.shields.io/badge/TRT-10.9.0-green)](https://developer.nvidia.com/tensorrt) +[![version](https://img.shields.io/badge/release-0.18.0-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture/overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing) diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index 40034b995f..aa9c5dc92c 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -94,6 +94,7 @@ private: [[nodiscard]] SizeType32 getMaxSequenceLen() const; [[nodiscard]] SizeType32 getMaxNumSequences() const; [[nodiscard]] SizeType32 getMaxDraftLen() const; + [[nodiscard]] SizeType32 getVocabSizePadded() const; void validateLlmRequest( LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const; diff --git a/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h b/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h index 5236f902c8..5ccfe0ee11 100644 --- a/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h +++ b/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h @@ -26,8 +26,6 @@ namespace tensorrt_llm::batch_manager { -namespace tle = tensorrt_llm::executor; - class AllocateKvCache : Algorithm { using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager; diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h index 17d558e675..2a288a42f4 100644 --- a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h +++ b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h @@ -56,7 +56,7 @@ public: struct ExpiringBlockComparator { - inline bool operator()(BlockPtr const& a, BlockPtr const& b) const + bool operator()(BlockPtr const& a, BlockPtr const& b) const { // If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the // raw pointer values. 
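The ExpiringBlockComparator hunk above ends on the comment explaining why the comparison falls back to raw pointer values when two blocks expire in the same millisecond: the tie-break keeps the ordering a strict weak ordering and makes iteration over the container deterministic. Below is a minimal standalone C++ sketch of that pattern; the Block type and field name here are hypothetical stand-ins, not the actual TensorRT-LLM kv-cache block class.

#include <chrono>
#include <memory>
#include <set>

// Hypothetical stand-in for a cache block that carries an expiration time.
struct Block
{
    std::chrono::steady_clock::time_point expirationTime;
};
using BlockPtr = std::shared_ptr<Block>;

// Order blocks by expiration time; when two blocks expire at the same
// instant, fall back to the raw pointer value so distinct blocks never
// compare equivalent and the set's iteration order stays deterministic.
struct ExpiringBlockComparator
{
    bool operator()(BlockPtr const& a, BlockPtr const& b) const
    {
        return a->expirationTime != b->expirationTime ? a->expirationTime < b->expirationTime
                                                      : a.get() < b.get();
    }
};

int main()
{
    std::set<BlockPtr, ExpiringBlockComparator> expiringBlocks;
    auto const now = std::chrono::steady_clock::now();
    expiringBlocks.insert(std::make_shared<Block>(Block{now}));
    // Same timestamp: the pointer tie-break keeps both entries in the set.
    expiringBlocks.insert(std::make_shared<Block>(Block{now}));
    return expiringBlocks.size() == 2 ? 0 : 1;
}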
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 7596130fb0..fa5cc7fe9d 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -166,6 +166,8 @@ class KVCacheBlock public: using IdType = std::int32_t; + static constexpr IdType kCachedBlocksRootId = -1; + explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx); void startScheduling(); @@ -379,6 +381,16 @@ public: return mKvCacheRetentionConfig.getDecodeDurationMs(); } + [[nodiscard]] bool getContextRequiresCyclicKvCache() const + { + return mContextRequiresCyclicKvCache; + } + + void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache) + { + mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache; + } + private: // Request id of the sequence LlmRequest::RequestIdType mRequestId; @@ -392,6 +404,9 @@ private: runtime::ITensor::SharedPtr mCacheBlockIndices; // The retention priority to assign to decode blocks executor::KvCacheRetentionConfig mKvCacheRetentionConfig; + + // A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache. + bool mContextRequiresCyclicKvCache{false}; }; // attach metadata to a pool pointer @@ -443,7 +458,7 @@ public: SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, CacheType cacheType = CacheType::kSELF, std::optional secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr eventManager = nullptr); + std::shared_ptr eventManager = nullptr, bool enableHashKey = false); ~BlockManager(); @@ -712,6 +727,9 @@ private: SizeType32 mMissedBlocks; std::set reusedBlockIds; + // Whether or not to maintain a hashmap of blocks. + bool mEnableHashKey; + private: friend class KVCacheManager; }; @@ -818,16 +836,18 @@ public: //! \details These blocks become reusable from next step. 
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0; - virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0; + [[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0; - virtual std::vector> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0; + [[nodiscard]] virtual std::vector> const& getCacheBlockIds( + LlmRequest::RequestIdType requestId) const + = 0; - virtual std::vector>> getBatchCacheBlockIds( + [[nodiscard]] virtual std::vector>> getBatchCacheBlockIds( std::vector const& requestIds) const = 0; - virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0; - virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0; + [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0; + [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0; virtual void refreshBlocks() = 0; virtual void flushIterationEvents() = 0; @@ -846,7 +866,7 @@ public: * 2 * modelConfig.getSizePerHead(); } - [[nodiscard]] static std::tuple const calculateMaxNumBlocks(KvCacheConfig const& config, + [[nodiscard]] static std::tuple calculateMaxNumBlocks(KvCacheConfig const& config, nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager); @@ -924,7 +944,7 @@ public: return mBlockManager.getNumFreeBlocks(); } - [[nodiscard]] virtual SizeType32 getNumPools() const override + [[nodiscard]] SizeType32 getNumPools() const override { return mBlockManager.getNumPools(); } @@ -994,8 +1014,6 @@ public: /// @return The number of blocks [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override; - void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens); - /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed. 
void addToken(LlmRequest::RequestIdType requestId) override; diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h index 6febf09887..41a34fed80 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h @@ -91,7 +91,7 @@ private: runtime::ITensor::SharedPtr mPool; runtime::ITensor::SharedPtr mCurrent; - const std::vector mBlockIds; + std::vector const mBlockIds; size_t mIdx; }; diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 4d80636647..c1aaebeca4 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -490,9 +490,14 @@ public: initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs); } - void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false) { + if (mEndId.has_value()) + { + TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded, + "EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded); + } TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()), "Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(), maxEncoderInputLen.value()); diff --git a/cpp/include/tensorrt_llm/runtime/samplingConfig.h b/cpp/include/tensorrt_llm/runtime/samplingConfig.h index 923dca81cd..d4b4ee3c26 100644 --- a/cpp/include/tensorrt_llm/runtime/samplingConfig.h +++ b/cpp/include/tensorrt_llm/runtime/samplingConfig.h @@ -21,6 +21,7 @@ #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/runtime/common.h" +#include #include #include #include diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index b08c13c0cb..1e363f63a4 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d -size 9125406 +oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1 +size 9123884 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 42ffff1710..23f1466689 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d5c2aac4ca61d80f8a61d06a7e17fcfb0c9428a3dd89ade705f14cf651acd4b -size 9169292 +oid sha256:e7a942b813cd05c5d21c82a7e5b5988227988668bf960b3a954443998b4e2a2b +size 9167324 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so index 
e9f4b7568b..88bcdfcdba 100755 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c067e858d968710bfe4575694ff5cabf1fb9d29924f6bd4cee552c4fd485a8ca -size 2026192 +oid sha256:a11179efe519b2b001d6021c7cbea949c81b3618395fa2ce44da9b09d7d35d14 +size 2029704 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 3d6f0c6ee3..1730f2bc44 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -ca50ae76421863dfebf6080b7f4f6b29 libtensorrt_llm_ucx_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +9f9942768fd5b0cf5ed19860ad539dc9 libtensorrt_llm_ucx_wrapper.so +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index e279e66354..de1c8e8b1d 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:092faef60d09d008331b2b09453d89563b5fe6c49a6a195051ad51ca80f79277 -size 8407972 +oid sha256:e2ec997b71d8b990eecc0034930b24319916ed3615a618f982e1b780d7643bc6 +size 8408224 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index e2cd91c68a..f6828f3a5b 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d00bce3aa2f555f98cb2213d183a3174bb083cd62179ac65dce24d75bd648eb -size 8374854 +oid sha256:2662dfb4833b41f71f4ef9cfe6a46d6312a04065c2343a899e1476102019a180 +size 8374456 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so index ec4557bc66..33347341b9 100755 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac4ad59907a322e6fbb85b9e88cea587cc4f5e4018017726895f64bd800c8552 +oid sha256:1a1d23d99e2b91fa11ee3e1fb94179c0731065059eb981da9434a42d98ffa4d8 size 15592 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt index 13f916f8de..bc11eb8b02 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -0ec83a0451530fcf0e3f325cdc185043 libtensorrt_llm_ucx_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +e383212a40dca932c7b77bf4544dab80 libtensorrt_llm_ucx_wrapper.so 
+3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib deleted file mode 100644 index f5ccca6cfe..0000000000 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a64b3c16150b34ad6437862eaf95c5c35acdf69facc40af14bc0632a16b7d162 -size 54093198 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt deleted file mode 100644 index edb78ee9fb..0000000000 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,2 +0,0 @@ -41725f3b40ca44954bb9de6e7dcbfd2c tensorrt_llm_batch_manager_static.lib -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index c3faa14335..719ef20171 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d333952a574e9056a0f01f788f9c053d92a7a9bc988a335df663697405f5659 -size 3102572 +oid sha256:d6ef115e34695dd0bec9df6069dd2e95615f401546ce275b133145fdb7568c6c +size 3102764 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index b8d3c22ade..a41548be59 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa6dcdbe9501429192102c1094be664d0ab07199acc4882ab061eb48b699d83c -size 3145248 +oid sha256:3bc68d4aec21a361954dd144084edb050e19390b87d6668f88b9e7f110f717a0 +size 3145744 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index c01167e075..5d29f71099 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -15c05b1921f3f8cbb7bc1e53f189c661 libtensorrt_llm_executor_static.a -b586d90eac1293ea656ff2db8a35cd92 libtensorrt_llm_executor_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +288c6aa23b993d60d758107188c37d78 libtensorrt_llm_executor_static.a +20e46fb4b5b23a0f27eb3f8dd7d4d3bf libtensorrt_llm_executor_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index 6594703a05..d568a4076a 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a4b144867eb79d269db173338ccae8dc6c68a8374a49fe17a555034ac433f46f -size 3457528 +oid sha256:5b819d5cf3f7d9bd1ee69427db4b7ce3eb9c17f9e2cfa319540dad51ed6648e7 +size 3457520 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 296dcd3894..108882da9c 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7b5f70dda7b8042d09098859a4bd1851159d3c750da97f3bfd55bf713c7a1cc -size 3447846 +oid sha256:9f123e25ff2e046afff092b082a60b461f3f13853630857bd166e5e8a084e1ee +size 3448406 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt index 4abbd8fc13..0d7ff37de2 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -d573456348a96fd7a97aa832f1113986 libtensorrt_llm_executor_static.a -b3ba7776e3b5bb6e750e8412fc3b5c60 libtensorrt_llm_executor_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +eebaf66c6ac42645a9bf12a0b013ac4b libtensorrt_llm_executor_static.a +1fc4cc62abfb31ad51f6ae3620641c04 libtensorrt_llm_executor_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib deleted file mode 100644 index 827b388048..0000000000 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3722133c17bdad38c3bbd3a5caa4eafbe17805d3a7fa784f10c313902020a13b -size 26350954 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt deleted file mode 100644 index fbfc2fbcdf..0000000000 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,2 +0,0 @@ -5beaadd32fc3dd25770746016b293229 tensorrt_llm_executor_static.lib -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 9d01b7d0fe..aee08cf678 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41cb6788cd975f0f2ef1de0bdff9d3178280a3c2683abb11929e0d5ccecc9d76 -size 126752312 +oid sha256:f82db62eaeeb8a02d44b4cad97ae050cc447eea8a3e48b03f56f6040d1aaccc8 +size 126824176 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index d36de4e84d..dba2e06400 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -200a2d19469277b9906a00f7da83fd04 libtensorrt_llm_nvrtc_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +f3143205203b038b9dca6dd32cf02f59 libtensorrt_llm_nvrtc_wrapper.so +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 5ae764d3e1..5167fc4b57 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:738668482149971eaa849b863360e21fe66781c5eeaadab8263c83a3b67637dc -size 133824576 +oid sha256:bdb100ae1f96025c5dd7e977cbae05005c2b2b3f36b902edc3a4f24d85ec3731 +size 133867944 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt index fcd6037b22..d5ce681f16 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -20761e50ba44b91a7a2d1f8d1c5c780b libtensorrt_llm_nvrtc_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +770ca93818f3f04837a67353e3f71fbc libtensorrt_llm_nvrtc_wrapper.so +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll deleted file mode 100644 index 383dc043e8..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5926bdebb3d02686d81b3e29edbdf3fb89e44f1518ae187a66284175fb980613 -size 1230336 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib deleted file mode 100644 index 51ec0f17b5..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97eb854ba561c25d46c3e9a074dc5a8ba873923fd28d649c869d9db450e26a8a -size 3488 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt deleted file mode 100644 index 788d1bb79d..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,3 +0,0 @@ -3082017cee538017c343567d938bb106 tensorrt_llm_nvrtc_wrapper.lib -b9b4bf6a2d38abae1d3e038ad73b3890 tensorrt_llm_nvrtc_wrapper.dll -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 435b00d8fd..4ad2b77974 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c9e20ddfa9d8b200a595a3a59afd53356afe4ad1b030f0b2cf446cac7271c58 -size 53382370 +oid sha256:cb21d66bb8b8eec2f6e11696b2b9b4b629b92ab299fec6702f2102277bb453bf +size 53355202 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index b61ada9906..55eb2f88cd 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b615014ee1f3e9ef8fd498a72ef8f699b92727177416c05d08bbf1c95d3ff52e -size 53479636 +oid sha256:7be13e39772baa2ade76d60407fa0d56ecca58c39b24e020f1f0b58c0eede5f0 +size 53469348 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index f8182838cb..e79ef6bc1b 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -32e01331abfcacf5b70854104ca4bf20 libtensorrt_llm_internal_cutlass_kernels_static.a -662d22cefd410c1851ac701e6e3bcbbf libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +6bf0ba4e9b8b1152a21316243d30bec6 libtensorrt_llm_internal_cutlass_kernels_static.a +96f8a359c84a78ba415f4d98ef1c4e1d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a 
b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index f25f1646be..1198abdc61 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baf28ca8666062b417d251b608f31227cfac15676aa0bbbaacb1263befc9d408 -size 68138662 +oid sha256:d5cb27ba31185f16333b697bf9d913015ee85508e96aea2248162f3ff9a618b9 +size 68126454 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index 3758e6c124..b3257f7cb4 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05358d9334e8976d463a40b4c6b0b5d780561556cb89194a8f243b0b69f59f33 -size 68308080 +oid sha256:c22c8b6856111183fc44fb11c8843ea8506f7297a97fee1e9a98414f9526118a +size 68295728 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index 199022625a..67f5b21039 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -d8af682c4274543b06992255e727f52e libtensorrt_llm_internal_cutlass_kernels_static.a -c797baf2a0a7538eb8f75e0f898ae208 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +64df74edb7e87b69478e4f9a2c0b3bb7 libtensorrt_llm_internal_cutlass_kernels_static.a +68a43f555a5b930950a436ebb54a1267 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib deleted file mode 100644 index 0baeacdc3d..0000000000 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3856b1a5ae0f8f8e489f79d6396f2cc2e823536728b042082892502bcd33d76 -size 246691546 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt deleted file mode 100644 index 8ca26b27ce..0000000000 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,2 +0,0 @@ -ab51496e515622f560f4b989ed1d7e63 tensorrt_llm_internal_cutlass_kernels_static.lib -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/penaltyKernels.cu 
b/cpp/tensorrt_llm/kernels/penaltyKernels.cu index bb78192c24..257ce8a51f 100644 --- a/cpp/tensorrt_llm/kernels/penaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/penaltyKernels.cu @@ -213,8 +213,9 @@ __global__ void batchApplyPenalty(T const* const* inputLogits, T* outputLogits, if (hasMinLength) { __syncthreads(); - // Min length - if ((threadIdx.x == 0) && (currentStep - inputLen < minLength)) + // If current generation length is too short, make sure EOS doesn't have high probability. + // This check is not needed when endId is already -1 as generation won't stop on EOS anyway. + if ((threadIdx.x == 0) && (currentStep - inputLen < minLength) && endIds[batchSlot] > -1) { outLogitsPtr[endIds[batchSlot]] = MASK_VAL; } diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index a573d3911d..d6770adec9 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -79,7 +79,8 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr if (tid < k && endIds != nullptr) // if returnAllSelectedToken, endIds would not be an input { auto const index = tmpTopKBufIndex + tid; - if (blockLane == 0 && tid == 0) + // endId=-1 means generation doesn't stop upon encountering a certain token. + if (blockLane == 0 && tid == 0 && endIds[batchSlot] > -1) { auto const endId = endIds[batchSlot]; topKTmpIdBuf[index] = tmpLogBufIndex + endId; diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu index f797440c61..d211a99b81 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu @@ -252,29 +252,49 @@ __device__ __forceinline__ void MULTIMEM_ST2(ValType& val, PtrType ptr) asm volatile("multimem.st.global.v2.f32 [%0], {%1,%2};" ::"l"(ptr), "r"(val.x), "r"(val.y) : "memory"); } -template +template __device__ __forceinline__ void MULTIMEM_LD(ValType& val, PtrType ptr) { if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) - : "l"(ptr) - : "memory"); + if (!DISABLE_FP32_ACC) + { + asm("multimem.ld_reduce.global.add.v4.f16x2.acc::f32 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } + else + { + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } } #ifdef ENABLE_BF16 if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) - : "l"(ptr) - : "memory"); + if (!DISABLE_FP32_ACC) + { + asm("multimem.ld_reduce.global.add.v4.bf16x2.acc::f32 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } + else + { + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } } #endif } // All MC kernels here -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr) @@ -310,7 +330,7 @@ __global__ void __launch_bounds__(MAX_THREADS) 
userbuffers_fp16_sum_inplace_gpu_ uint4 val[UNROLL_MC]; #pragma unroll for (int i = 0; i < UNROLL_MC; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); #pragma unroll for (int i = 0; i < UNROLL_MC; i++) MULTIMEM_ST(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); @@ -318,7 +338,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ for (int line = end_aligned; line < end_elem; line += loop_step0) { uint4 val; - MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); + MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); MULTIMEM_ST(val, mc_ptr + (lineoffset + line)); } __syncthreads(); @@ -336,7 +356,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ } // fp16 inplace reduce kernel (Hopper) MC #else -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr) @@ -382,8 +402,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg3), reinterpret_cast(&arg4), reinterpret_cast(&arg5), \ reinterpret_cast(&arg6), reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ reinterpret_cast(&arg9), reinterpret_cast(&arg10)}; \ - TLLM_CUDA_CHECK( \ - cudaLaunchKernelExC(&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ + &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc), kernelArgs)); \ } struct LaunchConfig @@ -529,7 +549,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4_mc(PackedVec& vec, float SFScaleV #endif } -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, @@ -574,7 +594,7 @@ __global__ void __launch_bounds__(MAX_THREADS) DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -643,7 +663,7 @@ __global__ void __launch_bounds__(MAX_THREADS) #endif } -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, @@ -687,7 +707,7 @@ __global__ void __launch_bounds__(MAX_THREADS) DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -744,7 +764,7 @@ __global__ void __launch_bounds__(MAX_THREADS) #if __CUDA_ARCH__ >= 900 -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -786,7 
+806,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -848,7 +868,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ *reduceidptr = reduce_id; } // quant kernel fp16->fp8 twoshot -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -889,7 +909,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -997,7 +1017,7 @@ __global__ void __launch_bounds__(MAX_THREADS) } // residual allgather kernel #else -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -1018,7 +1038,7 @@ __global__ void __launch_bounds__(MAX_THREADS) asm volatile("brkpt;\n"); } -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -1059,8 +1079,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg12), reinterpret_cast(&arg13), reinterpret_cast(&arg14), \ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant), kernelArgs)); \ } #define callranksMC_RMSNORM_QUANT_ONESHOT(x) \ @@ -1091,8 +1111,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg12), reinterpret_cast(&arg13), reinterpret_cast(&arg14), \ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot), \ + kernelArgs)); \ } #define callranksMC_RMSNORM_QUANT_FP4(x) \ @@ -1127,8 +1148,8 @@ __global__ void __launch_bounds__(MAX_THREADS) 
userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20), \ reinterpret_cast(&arg21), reinterpret_cast(&arg22), reinterpret_cast(&arg23)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4), kernelArgs)); \ } #define callranksMC_RMSNORM_QUANT_FP4_ONESHOT(x) \ @@ -1163,8 +1184,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20), \ reinterpret_cast(&arg21), reinterpret_cast(&arg22)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot), \ + kernelArgs)); \ } #define callranksMC_RES_AG(x) \ if (nlines == x) \ @@ -1189,7 +1211,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_res_allgather), kernelArgs)); \ } -template +template int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size_t const offset, size_t const elements, int const blocksize, communicator* comm, cudaStream_t stream, int op) { @@ -1226,7 +1248,7 @@ int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size return sms; } -template +template void allreduce_nonsharp_inplace( int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream, int op) { @@ -1234,14 +1256,25 @@ void allreduce_nonsharp_inplace( return; int blocksize = elements * 2; int maxcredit = 0; - int sms = allreduce2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, stream, op); + int sms; + if (DISABLE_FP32_ACC) + { + sms = allreduce2_userbuff_inplace_gpu( + maxcredit, handler, offset, elements, blocksize, comm, stream, op); + } + else + { + sms = allreduce2_userbuff_inplace_gpu( + maxcredit, handler, offset, elements, blocksize, comm, stream, op); + } } -template +template void allreduce2_userbuff_inplace( int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream) { - allreduce_nonsharp_inplace(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2); + allreduce_nonsharp_inplace( + handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2); } bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size) @@ -1262,7 +1295,7 @@ bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size) } } -template +template int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const offset, int const out_handler, size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, void* residual_out, communicator* comm, cudaStream_t stream) @@ -1315,7 +1348,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const of return sms; } -template +template int allreduce2_userbuff_inplace_rmsnorm_quant_fp4(int const handler, size_t const offset, int 
const out_handler, size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, @@ -1422,11 +1455,31 @@ void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, si { switch (dataType) { - case nvinfer1::DataType::kHALF: allreduce2_userbuff_inplace(handler, offset, elements, comm, stream); break; + case nvinfer1::DataType::kHALF: + { + if (kDISABLE_FP32_ACCUMULATION) + { + allreduce2_userbuff_inplace(handler, offset, elements, comm, stream); + } + else + { + allreduce2_userbuff_inplace(handler, offset, elements, comm, stream); + } + break; + } #ifdef ENABLE_BF16 case nvinfer1::DataType::kBF16: - allreduce2_userbuff_inplace<__nv_bfloat16>(handler, offset, elements, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + allreduce2_userbuff_inplace<__nv_bfloat16, true>(handler, offset, elements, comm, stream); + } + else + { + allreduce2_userbuff_inplace<__nv_bfloat16, false>(handler, offset, elements, comm, stream); + } break; + } #endif default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_impl"); } @@ -1458,14 +1511,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_impl(int const handler, size_t con switch (dataType) { case nvinfer1::DataType::kHALF: - return allreduce2_userbuff_inplace_rmsnorm_quant(handler, offset, out_handler, out_offset, elements, - hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant(handler, offset, out_handler, out_offset, + elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + } + else + { + return allreduce2_userbuff_inplace_rmsnorm_quant(handler, offset, out_handler, out_offset, + elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + } break; + } #ifdef ENABLE_BF16 case nvinfer1::DataType::kBF16: - return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16>(handler, offset, out_handler, out_offset, - elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, true>(handler, offset, out_handler, + out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, + stream); + } + else + { + return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, false>(handler, offset, out_handler, + out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, + stream); + } break; + } #endif default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl"); } @@ -1479,16 +1554,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t switch (dataType) { case nvinfer1::DataType::kHALF: - return allreduce2_userbuff_inplace_rmsnorm_quant_fp4(handler, offset, out_handler, out_offset, - scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, - residual_out, comm, stream); + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant_fp4(handler, offset, out_handler, out_offset, + scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, + residual_out, comm, stream); + } + else + { + return 
allreduce2_userbuff_inplace_rmsnorm_quant_fp4(handler, offset, out_handler, out_offset, + scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, + residual_out, comm, stream); + } break; #ifdef ENABLE_BF16 case nvinfer1::DataType::kBF16: - return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16>(handler, offset, out_handler, out_offset, - scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, - residual_out, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, true>(handler, offset, out_handler, + out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, + residual_in, residual_out, comm, stream); + } + else + { + return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, false>(handler, offset, out_handler, + out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, + residual_in, residual_out, comm, stream); + } break; + } #endif default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl"); } diff --git a/cpp/tensorrt_llm/kernels/userbuffers/utils.h b/cpp/tensorrt_llm/kernels/userbuffers/utils.h index 60a7f714ba..c280ab72e3 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/utils.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/utils.h @@ -656,4 +656,7 @@ __inline__ __device__ T blockReduceSumV2(T* val) warpReduceSumV2(val); return (T) 0.0f; } + +static bool const kDISABLE_FP32_ACCUMULATION = getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr; + } // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index aa27ac6b3b..73343ec2b7 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -96,18 +96,7 @@ inline bool is_supported(int arch, KernelType kernel_type) SUPPORT(KernelType::FP16Int8PerChannel); SUPPORT(KernelType::FP16Int4PerChannel); } - else if (arch >= 80 && arch < 90) - { - SUPPORT(KernelType::FP16Int8Groupwise); - SUPPORT(KernelType::BF16Int8Groupwise); - SUPPORT(KernelType::FP16Int4Groupwise); - SUPPORT(KernelType::BF16Int4Groupwise); - SUPPORT(KernelType::FP16Int8PerChannel); - SUPPORT(KernelType::BF16Int8PerChannel); - SUPPORT(KernelType::FP16Int4PerChannel); - SUPPORT(KernelType::BF16Int4PerChannel); - } - else if (arch >= 90 && arch != 120) + else if (arch >= 80) { SUPPORT(KernelType::FP16Int8Groupwise); SUPPORT(KernelType::BF16Int8Groupwise); diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 491d362b85..ce6d8b05f2 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -64,7 +64,7 @@ void initBindings(pybind11::module_& m) py::classh(m, "GenericLlmRequest") .def("validate", &GenLlmReq::validate, py::arg("max_input_len"), py::arg("max_seq_len"), - py::arg("max_draft_len"), py::arg("max_endocer_input_len") = std::nullopt, + py::arg("max_draft_len"), py::arg("vocab_size_padded"), py::arg("max_endocer_input_len") = std::nullopt, py::arg("enable_kv_cache_reuse") = false) .def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, py::arg("exclude")) .def("get_num_tokens", &GenLlmReq::getNumTokens, py::arg("beam")) diff 
--git a/cpp/tests/resources/scripts/build_chatglm_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py index 7cd1d686fd..35efba1eb4 100644 --- a/cpp/tests/resources/scripts/build_chatglm_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -96,7 +96,10 @@ def build_engines(model_cache: typing.Optional[str] = None, world_size: int = 1, clean: Optional[bool] = False): - for model_name in ["chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b"]: + for model_name in [ + "chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b", "glm-4-9b", + "chatglm3-6b-32k" + ]: is_chatglm_6b_or_glm_10b = model_name in ["chatglm-6b", "glm-10b"] if model_cache and (Path(model_cache) / model_name).is_dir(): model_cache_dir = Path(model_cache) / model_name @@ -136,20 +139,12 @@ def build_engines(model_cache: typing.Optional[str] = None, if ckpt_dir.is_dir(): shutil.rmtree(ckpt_dir, ignore_errors=True) - # Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B, hope to remove this in the future - if model_name == "chatglm-6b": + # Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B / ChatGLM3-6B-32K, hope to remove this in the future + if model_name in [ + "chatglm-6b", "glm-4-9b", "chatglm2-6b", "chatglm3-6b-32k" + ]: shutil.copy( - chatglm_example_dir / "chatglm-6b/tokenization_chatglm.py", - hf_dir, - ) - if model_name == "glm-4-9b": - shutil.copy( - chatglm_example_dir / "glm-4-9b/tokenization_chatglm.py", - hf_dir, - ) - if model_name == "chatglm2-6b": - shutil.copy( - chatglm_example_dir / "chatglm2-6b/tokenization_chatglm.py", + chatglm_example_dir / f"{model_name}/tokenization_chatglm.py", hf_dir, ) diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 200ed5baa8..cca154b1a0 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -530,12 +530,6 @@ def prepare_multi_gpu_model_tests(python_exe: str, resources_dir=resources_dir, model_cache_arg=model_cache_arg) - prepare_model_tests(model_name="chatglm", - python_exe=python_exe, - root_dir=root_dir, - resources_dir=resources_dir, - model_cache_arg=model_cache_arg) - def prepare_model_tests(model_name: str, python_exe: str, diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index d1b13d5f84..1593f242ac 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,9 +1,9 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=25.01-py3 +ARG BASE_TAG=25.03-py3 ARG DEVEL_IMAGE=devel -FROM ${BASE_IMAGE}:${BASE_TAG} as base +FROM ${BASE_IMAGE}:${BASE_TAG} AS base # https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html # The default values come from `nvcr.io/nvidia/pytorch` @@ -14,7 +14,10 @@ ENV GITHUB_MIRROR=$GITHUB_MIRROR RUN echo "Using GitHub mirror: $GITHUB_MIRROR" SHELL ["/bin/bash", "-c"] -FROM base as devel +# Clean up the pip constraint file from the base NGC PyTorch image. 
+RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true + +FROM base AS devel ARG PYTHON_VERSION="3.12.3" RUN echo "Using Python version: $PYTHON_VERSION" @@ -59,7 +62,7 @@ ARG TORCH_INSTALL_TYPE="skip" COPY docker/common/install_pytorch.sh install_pytorch.sh RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh -FROM ${DEVEL_IMAGE} as wheel +FROM ${DEVEL_IMAGE} AS wheel WORKDIR /src/tensorrt_llm COPY benchmarks benchmarks COPY cpp cpp @@ -77,7 +80,7 @@ ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings - RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \ python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} -FROM ${DEVEL_IMAGE} as release +FROM ${DEVEL_IMAGE} AS release # Create a cache directory for pip RUN mkdir -p /root/.cache/pip diff --git a/docker/Makefile b/docker/Makefile index 9ea6eac62a..a6ebd81091 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -152,16 +152,16 @@ jenkins-aarch64_%: STAGE = devel jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"') jenkins-rockylinux8_%: STAGE = devel jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda -jenkins-rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8 +jenkins-rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8 rockylinux8_%: STAGE = devel rockylinux8_%: BASE_IMAGE = nvidia/cuda -rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8 +rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8 # For x86_64 and aarch64 ubuntu22_%: STAGE = devel ubuntu22_%: BASE_IMAGE = nvidia/cuda -ubuntu22_%: BASE_TAG = 12.6.3-devel-ubuntu22.04 +ubuntu22_%: BASE_TAG = 12.8.1-devel-ubuntu22.04 trtllm_%: STAGE = release trtllm_%: PUSH_TO_STAGING := 0 diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh index 1045cd1237..042c26a0a8 100644 --- a/docker/common/install_cuda_toolkit.sh +++ b/docker/common/install_cuda_toolkit.sh @@ -5,7 +5,7 @@ set -ex # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file. # CUDA version is usually aligned with the latest NGC CUDA image tag. # Only use when public CUDA image is not ready. -CUDA_VER="12.8.0_570.86.10" +CUDA_VER="12.8.1_570.124.06" CUDA_VER_SHORT="${CUDA_VER%_*}" NVCC_VERSION_OUTPUT=$(nvcc --version) diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 605420b0e7..85371b81f1 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,9 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 -# PyTorch v2.5.1 has a fix for https://github.com/pytorch/pytorch/issues/138324. 
-TORCH_VERSION="2.5.1" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03 +TORCH_VERSION="2.6.0" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index f158cb3491..e0cc092666 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,20 +2,20 @@ set -ex -TRT_VER="10.8.0.43" +TRT_VER="10.9.0.34" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 -CUDA_VER="12.8" # 12.8.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03 +CUDA_VER="12.8" # 12.8.1 # Keep the installation for cuDNN if users want to install PyTorch with source codes. # PyTorch 2.x can compile with cuDNN v9. -CUDNN_VER="9.7.0.66-1" +CUDNN_VER="9.8.0.87-1" NCCL_VER="2.25.1-1+cuda12.8" -CUBLAS_VER="12.8.3.14-1" +CUBLAS_VER="12.8.4.1-1" # Align with the pre-installed CUDA / NVCC / NVRTC versions from # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html -NVRTC_VER="12.8.61-1" -CUDA_RUNTIME="12.8.57-1" -CUDA_DRIVER_VERSION="570.86.10-1.el8" +NVRTC_VER="12.8.93-1" +CUDA_RUNTIME="12.8.90-1" +CUDA_DRIVER_VERSION="570.124.06-1.el8" for i in "$@"; do case $i in @@ -116,7 +116,7 @@ install_tensorrt() { if [ -z "$ARCH" ];then ARCH=$(uname -m);fi if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi - RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz" + RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz" fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ diff --git a/docs/requirements.txt b/docs/requirements.txt index b696542fed..cf58a547b5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ sphinx>=7.0 sphinx-argparse sphinx-click -sphinx-rtd-theme +nvidia-sphinx-theme myst_parser breathe pygit2 diff --git a/docs/source/conf.py b/docs/source/conf.py index d0434836ab..c9964c09be 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,7 +67,7 @@ source_suffix = { '.md': 'markdown', } -html_theme = 'sphinx_rtd_theme' +html_theme = 'nvidia_sphinx_theme' html_static_path = ['_static'] # ------------------------ C++ Doc related -------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index f4405ccf99..50f321e54a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -27,8 +27,6 @@ Welcome to TensorRT-LLM's Documentation! installation/linux.md installation/build-from-source-linux.md - installation/windows.md - installation/build-from-source-windows.md installation/grace-hopper.md diff --git a/docs/source/installation/build-from-source-windows.md b/docs/source/installation/build-from-source-windows.md deleted file mode 100644 index de31e4fcce..0000000000 --- a/docs/source/installation/build-from-source-windows.md +++ /dev/null @@ -1,199 +0,0 @@ -(build-from-source-windows)= - -# Building from Source Code on Windows - -```{note} -This section is for advanced users. Skip this section if you plan to use the pre-built TensorRT-LLM release wheel. 
-``` - -## Prerequisites - -1. Install prerequisites listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document. -2. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path. -3. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). -4. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). - -## Building a TensorRT-LLM Docker Image - -### Docker Desktop - -1. Install [Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/). -2. Set the following configurations: - - 1. Right-click the Docker icon in the Windows system tray (bottom right of your taskbar) and select **Switch to Windows containers...**. - 2. In the Docker Desktop settings on the **General** tab, uncheck **Use the WSL 2 based image**. - 3. On the **Docker Engine** tab, set your configuration file to: - - ``` - { - "experimental": true - } - ``` - -```{note} -After building, copy the files out of your container. `docker cp` is not supported on Windows for Hyper-V based images. Unless you are using WSL 2 based images, mount a folder, for example, `trt-llm-build`, to your container when you run it for moving files between the container and host system. -``` - -### Acquire an Image - -The Docker container will be hosted for public download in a future release. At this time, it must be built manually. From the `TensorRT-LLM\windows\` folder, run the build command: - -```bash -docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest . -``` - -And your image is now ready for use. - -### Run the Container - -Run the container in interactive mode with your build folder mounted. Specify a memory limit with the `-m` flag. By default, the limit is 2 GB, which is not sufficient to build TensorRT-LLM. - -```bash -docker run -it -m 12g -v .\trt-llm-build:C:\workspace\trt-llm-build tensorrt-llm-windows-build:latest -``` - -### Build and Extract Files - -1. Clone and setup the TensorRT-LLM repository within the container. - -```bash -git clone https://github.com/NVIDIA/TensorRT-LLM.git -cd TensorRT-LLM -git submodule update --init --recursive -``` - -2. Build TensorRT-LLM. This command generates `build\tensorrt_llm-*.whl`. - -```bash -python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-10.8.0.43\ -``` - -3. Copy or move `build\tensorrt_llm-*.whl` into your mounted folder so it can be accessed on your host machine. If you intend to use the C++ runtime, you'll also need to gather various DLLs from the build into your mounted folder. For more information, refer to [C++ Runtime Usage](#c-runtime-usage). - - - -## Building TensorRT-LLM on Bare Metal - -**Prerequisites** - -1. Install all prerequisites (`git`, `python`, `CUDA`) listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document. -2. Install Nsight NVTX. TensorRT-LLM on Windows currently depends on NVTX assets that do not come packaged with the CUDA 12.8.0 installer. To install these assets, download the [CUDA 11.8 Toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows&target_arch=x86_64). - - 1. During installation, select **Advanced installation**. - - 2. Nsight NVTX is located in the CUDA drop-down. - - 3. Deselect all packages, and select **Nsight NVTX**. - -3. 
Install the dependencies one of two ways: - - 1. Run the `setup_build_env.ps1` script, which installs CMake, Microsoft Visual Studio Build Tools, and TensorRT automatically with default settings. - - 1. Run PowerShell as Administrator to use the script. - - ```bash - ./setup_build_env.ps1 -TRTPath [-skipCMake] [-skipVSBuildTools] [-skipTRT] - ``` - - 2. Close and reopen PowerShell after running the script so that `Path` changes take effect. - - 3. Supply a directory that already exists to contain TensorRT to `-TRTPath`, for example, `-TRTPath ~/inference` may be valid, but `-TRTPath ~/inference/TensorRT` will not be valid if `TensorRT` does not exist. `-TRTPath` isn't required if `-skipTRT` is supplied. - - 2. Install the dependencies one at a time. - - 1. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path. - 2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). When prompted to select more Workloads, check **Desktop development with C++**. - 3. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`. - - 1. Add the libraries for TensorRT to your system's `Path` environment variable. Your `Path` should include a line like this: - - ```bash - %USERPROFILE%\inference\TensorRT\lib - ``` - - 2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. - - 3. Remove existing `tensorrt` wheels first by executing - - ```bash - pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings - pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 - ``` - - 4. Install the TensorRT core libraries, run PowerShell, and use `pip` to install the Python wheel. - - ```bash - pip install %USERPROFILE%\inference\TensorRT\python\tensorrt-*.whl - ``` - - 5. Verify that your TensorRT installation is working properly. - - ```bash - python -c "import tensorrt as trt; print(trt.__version__)" - ``` - - -**Steps** - -1. Launch a 64-bit Developer PowerShell. From your usual PowerShell terminal, run one of the following two commands. - - 1. If you installed Visual Studio Build Tools (that is, used the `setup_build_env.ps1` script): - - ```bash - & 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64 - ``` - - 2. If you installed Visual Studio Community (e.g. via manual GUI setup): - - ```bash - & 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64 - ``` - -2. In PowerShell, from the `TensorRT-LLM` root folder, run: - -```bash -python .\scripts\build_wheel.py -a "89-real" --trt_root -``` - -The `-a` flag specifies the device architecture. `"89-real"` supports GeForce 40-series cards. - -The flag `-D "ENABLE_MULTI_DEVICE=0"`, while not specified here, is implied on Windows. Multi-device inference is supported on Linux, but not on Windows. - -This command generates `build\tensorrt_llm-*.whl`. - -(c-runtime-usage)= -## Linking with the TensorRT-LLM C++ Runtime - -```{note} -This section is for advanced users. Skip this section if you do not intend to use the TensorRT-LLM C++ runtime directly. You must build from source to use the C++ runtime. 
-``` - -Building from source creates libraries that can be used if you wish to directly link against the C++ runtime for TensorRT-LLM. These libraries are also required if you wish to run C++ unit tests and some benchmarks. - -Building from source produces the following library files. -- `tensorrt_llm` libraries located in `cpp\build\tensorrt_llm` - - `tensorrt_llm.dll` - Shared library - - `tensorrt_llm.exp` - Export file - - `tensorrt_llm.lib` - Stub for linking to `tensorrt_llm.dll` -- Dependency libraries (these get copied to `tensorrt_llm\libs\`) - - `nvinfer_plugin_tensorrt_llm` libraries located in `cpp\build\tensorrt_llm\plugins\` - - `nvinfer_plugin_tensorrt_llm.dll` - - `nvinfer_plugin_tensorrt_llm.exp` - - `nvinfer_plugin_tensorrt_llm.lib` - - `th_common` libraries located in `cpp\build\tensorrt_llm\thop\` - - `th_common.dll` - - `th_common.exp` - - `th_common.lib` - -The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these: - -```bash -%USERPROFILE%\inference\TensorRT\lib -%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm -%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs -%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib -``` - -Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder. - -Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. diff --git a/docs/source/installation/grace-hopper.md b/docs/source/installation/grace-hopper.md index 98d8e5189c..bc48c702eb 100644 --- a/docs/source/installation/grace-hopper.md +++ b/docs/source/installation/grace-hopper.md @@ -5,7 +5,7 @@ 1. Install TensorRT-LLM (tested on Ubuntu 24.04). ```bash - pip3 install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 + pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm --extra-index-url https://pypi.nvidia.com ``` diff --git a/docs/source/installation/windows.md b/docs/source/installation/windows.md deleted file mode 100644 index 9f6bd51b71..0000000000 --- a/docs/source/installation/windows.md +++ /dev/null @@ -1,81 +0,0 @@ -(windows)= - -# Installing on Windows - -```{note} -The Windows release of TensorRT-LLM is currently in beta. -We recommend checking out the [v0.17.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.17.0) for the most stable experience. -``` - -```{note} -TensorRT-LLM on Windows only supports single-GPU execution. -``` - -**Prerequisites** - -1. Clone this repository using [Git for Windows](https://git-scm.com/download/win). - -2. Install the dependencies one of two ways: - - 1. Install all dependencies together. - - 1. Run the provided PowerShell script `setup_env.ps1` located under the `/windows/` folder which installs Python and CUDA 12.8.0 automatically with default settings. Run PowerShell as Administrator to use the script. - - ```bash - ./setup_env.ps1 [-skipCUDA] [-skipPython] - ``` - - 2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path` modified by the `setup_env.ps1` script above. - - 2. 
Install the dependencies one at a time. - - 1. Install [Python 3.10](https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe). - - 1. Select **Add python.exe to PATH** at the start of the installation. The installation may only add the `python` command, but not the `python3` command. - 2. Navigate to the installation path `%USERPROFILE%\AppData\Local\Programs\Python\Python310` (`AppData` is a hidden folder) and copy `python.exe` to `python3.exe`. - - 2. Install [CUDA 12.8.0 Toolkit](https://developer.nvidia.com/cuda-12-8-0-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart. - - 3. If using conda environment, run the following command before installing TensorRT-LLM. - ```bash - conda install -c conda-forge pyarrow - ``` - - -**Steps** - -1. Install TensorRT-LLM. - - If you have an existing TensorRT installation (from older versions of `tensorrt_llm`), please execute - - ```bash - pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings - pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 - ``` - - before installing TensorRT-LLM with the following command. - - ```bash - pip install tensorrt_llm==0.17.0.post1 --extra-index-url https://download.pytorch.org/whl/ --extra-index-url https://pypi.nvidia.com - ``` - - Run the following command to verify that your TensorRT-LLM installation is working properly. - - ```bash - python -c "import tensorrt_llm; print(tensorrt_llm._utils.trt_version())" - ``` - -2. Build the model. -3. Deploy the model. - -**Known Issue** - -1. `OSError: exception: access violation reading 0x0000000000000000` during `import tensorrt_llm` or `trtllm-build`. - -This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install -[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version) -and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL). - -2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies. - -Installing the latest [Build Tools for Visual Studio 2022] (https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue. diff --git a/docs/source/overview.md b/docs/source/overview.md index 2a2f8dadae..2c8f49bdcd 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -35,7 +35,7 @@ Certain limitations might apply. Refer to the {ref}`support-matrix` for more inf ### Native Windows Support -Application developers and AI enthusiasts can now benefit from accelerated LLMs running locally on PCs and Workstations powered by NVIDIA RTX and NVIDIA GeForce RTX GPUs. Refer to the {ref}`windows` section for more information. +Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases. ## What Can You Do With TensorRT-LLM? 
diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index bcbcb196de..f072f48015 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -90,7 +90,7 @@ In addition, older architectures can have limitations for newer software release * - - Hardware Compatibility * - Operating System - - TensorRT-LLM requires Linux x86_64, Linux aarch64 or Windows. + - TensorRT-LLM requires Linux x86_64 or Linux aarch64. * - GPU Model Architectures - - [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/) @@ -112,9 +112,9 @@ The following table shows the supported software for TensorRT-LLM. * - - Software Compatibility * - Container - - [25.01](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) + - [25.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) * - TensorRT - - [10.8](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) + - [10.9](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) * - Precision - - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4 diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index 8ed209badc..ea75de3c34 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -5,6 +5,23 @@ All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/). +## TensorRT-LLM Release 0.18.0 + +### Key Features and Enhancements + - **Features that were previously available in the 0.18.0.dev pre-releases are not included in this release**. + - [BREAKING CHANGE] Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases. + +### Known Issues + - The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms. + +### Infrastructure Changes + - The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.03-py3`. + - The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.03-py3`. + - The dependent TensorRT version is updated to 10.9. + - The dependent CUDA version is updated to 12.8.1. + - The dependent NVIDIA ModelOpt version is updated to 0.25 for Linux platform. 
+ + ## TensorRT-LLM Release 0.17.0 ### Key Features and Enhancements diff --git a/docs/source/torch.md b/docs/source/torch.md index 1522d1dd3d..fc76269382 100644 --- a/docs/source/torch.md +++ b/docs/source/torch.md @@ -33,7 +33,7 @@ Or you can try the following commands to get a quantized model by yourself: ```bash git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/llm_ptq +cd TensorRT-Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index 8ff315a52c..b333586508 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index 8c677a9944..59c42359ab 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py b/examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py new file mode 100644 index 0000000000..a260656e7d --- /dev/null +++ b/examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py @@ -0,0 +1,313 @@ +import json +import os +from typing import Dict, List, Optional, Union + +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.tokenization_utils_base import BatchEncoding, EncodedInput +from transformers.utils import PaddingStrategy + + +class SPTokenizer: + + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = [ + "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", + "<|user|>", "<|assistant|>", "<|observation|>" + ] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, + vocab_file, + padding_side="left", + clean_up_tokenization_spaces=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__( + padding_side=padding_side, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = { + self._convert_id_to_token(i): i + for i in range(self.vocab_size) + } + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join(save_directory, + self.vocab_files_names["vocab_file"]) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_bytes = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_bytes) + + return (vocab_file, ) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>") + ] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps( + item["tools"], indent=4, ensure_ascii=False) + input_ids.extend( + self.build_single_message(item["role"], + item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], + return_tensors="pt", + is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [ + self.get_command("") + ] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + padding_side: str = "left", # wili, fix for new transformers + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and ( + max_length % pad_to_multiple_of != 0): + max_length = ( + (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( + required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [ + 0 + ] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [ + 0 + ] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[ + 0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index ccfdf00b22..9af012fb93 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/commandr/requirements.txt b/examples/commandr/requirements.txt index 39a247e708..f5d7970be4 100644 --- a/examples/commandr/requirements.txt +++ b/examples/commandr/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index e05fb8b017..b5041429fe 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/deepseek_v1/requirements.txt b/examples/deepseek_v1/requirements.txt index 4cf08453e3..752f91c48c 100644 --- a/examples/deepseek_v1/requirements.txt +++ b/examples/deepseek_v1/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/draft_target_model/requirements.txt +++ b/examples/draft_target_model/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 
datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/eagle/requirements.txt b/examples/eagle/requirements.txt index bcc8468154..851cd3e45d 100644 --- a/examples/eagle/requirements.txt +++ b/examples/eagle/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 SentencePiece~=0.1.99 diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 012045e5f1..af1fce6b5c 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index e3de0c99ce..9733032d05 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -2,7 +2,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 49d198a5b6..a147c2f04f 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index ed02cf1ea5..e82e5403ca 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index 4ade6c457e..61e9be0a1c 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index 8bd9d2e519..1ed21f8a72 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,5 +1,5 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index f3384bb7d2..3583d8cc11 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/jais/requirements.txt b/examples/jais/requirements.txt index 49d198a5b6..a147c2f04f 100644 --- a/examples/jais/requirements.txt +++ b/examples/jais/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 6c8bfdb84b..817652fff7 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers>=4.43.0 datasets==2.14.6 evaluate~=0.4.1 diff --git 
a/examples/lookahead/requirements.txt b/examples/lookahead/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/lookahead/requirements.txt +++ b/examples/lookahead/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 4c74bdb527..991a392421 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers>=4.39.0 datasets~=2.14.5 evaluate diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 5918f86123..d8d78c6726 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,3 +1,3 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index ed02cf1ea5..e82e5403ca 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index 810efadc3f..714eba33dd 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 nemo-toolkit[all]==2.0.0rc1 megatron-core==0.8.0 datasets~=2.14.5 diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index ed02cf1ea5..e82e5403ca 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index c86d655add..bef827d3ca 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/prompt_lookup/requirements.txt b/examples/prompt_lookup/requirements.txt index 32b0788b09..654b312853 100644 --- a/examples/prompt_lookup/requirements.txt +++ b/examples/prompt_lookup/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index 81d80a1eda..3bc34bba56 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets>=2.14.4 nemo-toolkit[all]==2.0.0rc1 rouge_score~=0.1.2 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 6f015adc09..630b9bfa44 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git 
a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index bd5966a120..debbcdaf91 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 26cc3f714d..0d98bb13b2 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ -tensorrt_llm==0.17.0.post1 -git+https://github.com/google-deepmind/recurrentgemma.git +tensorrt_llm==0.18.0 +git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365 flax>=0.8.2 jax~=0.4.23 orbax-checkpoint==0.5.7 diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index 8e3e9b528b..e164ec3254 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index fef42c7e41..8f8bec16d6 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index a1fab87089..6c82f0c0d4 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 tiktoken datasets kaldialign diff --git a/requirements.txt b/requirements.txt index 2ebec77668..d764bbeb20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,11 +17,11 @@ pandas h5py==3.12.1 StrEnum sentencepiece>=0.1.99 -tensorrt~=10.8.0 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 uses 2.6.0a0. -torch>=2.5.1,<=2.6.0a0 +tensorrt~=10.9.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03 uses 2.7.0a0. +torch>=2.6.0,<=2.7.0a0 torchvision -nvidia-modelopt[torch]~=0.23.0 +nvidia-modelopt[torch]~=0.25.0 nvidia-nccl-cu12 nvidia-cuda-nvrtc-cu12 # TODO: Remove the <4.48.0 constraint once https://github.com/huggingface/transformers/issues/35625 is resolved. 
@@ -41,5 +41,5 @@ uvicorn httpx setuptools ordered-set -flashinfer @ git+https://github.com/flashinfer-ai/flashinfer.git@06309c4e -s2wrapper @ git+https://github.com/bfshi/scaling_on_scales.git@60da2afe +einops +flashinfer-python~=0.2.0 diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py index 78f7104417..aeeee513c0 100644 --- a/tensorrt_llm/_torch/attention_backend/flashinfer.py +++ b/tensorrt_llm/_torch/attention_backend/flashinfer.py @@ -8,7 +8,8 @@ import torch from flashinfer.jit.core import check_cuda_arch from tensorrt_llm._torch.attention_backend.interface import ( - AttentionBackend, AttentionMask, AttentionMetadata, PredefinedAttentionMask) + AttentionBackend, AttentionMask, AttentionMetadata, PredefinedAttentionMask, + dummy_forward) from tensorrt_llm.functional import AttentionMaskType from tensorrt_llm.models.modeling_utils import QuantConfig @@ -420,123 +421,6 @@ class FlashInferAttention(AttentionBackend[FlashInferAttentionMetadata]): if quant_mode.has_fp8_kv_cache(): self.has_fp8_kv_cache = True - @torch.library.custom_op("trtllm::flashinfer_forward", mutates_args=()) - @staticmethod - def forward_pattern( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - num_heads: int, - head_dim: int, - num_kv_heads: int, - layer_idx: int, - has_fp8_kv_cache: bool, - attention_mask_type: int, - attention_mask_data: Optional[torch.Tensor], - ) -> torch.Tensor: - ''' - Wrapping the flashinfer forward as a custom op is required to fix `torch.compile` graph breaks, - otherwise it will graph break when calling `metadata.num_contexts` since it convert tensor's sum directly to int. - ''' - # torch.compile does not support custom object as arguments, so we have to use global function to get the metadata. - metadata = get_metadata() - - # This is only for memory estimation for now. - # NOTE: this method is not accurate while it works for most scenario. 
- if metadata is None or metadata.kv_cache_manager is None: - q = q.view(1, -1, num_heads, head_dim) - k = k.view(1, -1, num_kv_heads, head_dim) - v = v.view(1, -1, num_kv_heads, head_dim) - return AttentionBackend.dummy_forward(q, k, v) - - assert isinstance( - metadata, - FlashInferAttentionMetadata, - ) - - # Query - q = q.view(-1, num_heads, head_dim) - - # Key and Value - kv_cache = metadata.kv_cache_manager.get_buffers(layer_idx) - - if k is not None and v is not None: - k = k.view(-1, num_kv_heads, head_dim) - v = v.view(-1, num_kv_heads, head_dim) - - if has_fp8_kv_cache: - assert kv_cache.dtype == torch.float8_e4m3fn, f"KV cache should have fp8 dtype, but get {kv_cache.dtype}" - k = k.to(torch.float8_e4m3fn) - v = v.to(torch.float8_e4m3fn) - assert k.dtype == v.dtype == kv_cache.dtype, f"KV cache dtype {kv_cache.dtype} does not match k/v dtype {k.dtype}/{v.dtype}" - - flashinfer.page.append_paged_kv_cache( - append_key=k, - append_value=v, - batch_indices=metadata.batch_indices, - positions=metadata.positions, - paged_kv_cache=kv_cache, - kv_indices=metadata.paged_kv_indices, - kv_indptr=metadata.paged_kv_indptr, - kv_last_page_len=metadata.paged_kv_last_page_len, - kv_layout=metadata.kv_layout) - - num_contexts = metadata.num_contexts - num_generations = metadata.num_generations - num_ctx_tokens = metadata.num_ctx_tokens - - def prefill_forward(plan_params: PlanParams): - wrapper = metadata.get_prefill_wrapper(plan_params) - output = wrapper.run(q[:num_ctx_tokens], kv_cache) - output = output.view(num_ctx_tokens, -1) - return output - - def decode_forward(plan_params: PlanParams): - wrapper = metadata.get_decode_wrapper(plan_params) - output = wrapper.run(q[num_ctx_tokens:], kv_cache) - output = output.view(num_generations, -1) - return output - - # this will do nothing if the last forward pass had the same parameters - plan_params = metadata.plan(num_heads, - num_kv_heads, - head_dim, - q_dtype=q.dtype, - kv_dtype=kv_cache.dtype, - attention_mask_type=attention_mask_type, - attention_mask_data=attention_mask_data) - - if num_contexts > 0: - ctx_output = prefill_forward(plan_params) - - if num_generations > 0: - gen_output = decode_forward(plan_params) - - if num_contexts > 0 and num_generations > 0: - output = torch.cat([ctx_output, gen_output], dim=0) - elif num_contexts > 0: - output = ctx_output - elif num_generations > 0: - output = gen_output - - return output - - @forward_pattern.register_fake - @staticmethod - def _( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - num_heads: int, - head_dim: int, - num_kv_heads: int, - layer_idx: int, - has_fp8_kv_cache: bool, - attention_mask_type: int, - attention_mask_data: Optional[torch.Tensor], - ): - return torch.empty_like(q) - def forward(self, q: torch.Tensor, k: Optional[torch.Tensor], @@ -554,7 +438,125 @@ class FlashInferAttention(AttentionBackend[FlashInferAttentionMetadata]): else: raise ValueError("Unexpected attention mask type") - return FlashInferAttention.forward_pattern( - q, k, v, self.num_heads, self.head_dim, self.num_kv_heads, - self.layer_idx, self.has_fp8_kv_cache, attention_mask_type, - attention_mask_data) + return forward_pattern(q, k, v, self.num_heads, self.head_dim, + self.num_kv_heads, self.layer_idx, + self.has_fp8_kv_cache, attention_mask_type, + attention_mask_data) + + +@torch.library.custom_op("trtllm::flashinfer_forward", mutates_args=()) +def forward_pattern( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + num_heads: int, + head_dim: int, + num_kv_heads: int, + layer_idx: int, 
+ has_fp8_kv_cache: bool, + attention_mask_type: int, + attention_mask_data: Optional[torch.Tensor], +) -> torch.Tensor: + ''' + Wrapping the flashinfer forward as a custom op is required to fix `torch.compile` graph breaks, + otherwise it will graph break when calling `metadata.num_contexts` since it convert tensor's sum directly to int. + ''' + # torch.compile does not support custom object as arguments, so we have to use global function to get the metadata. + metadata = get_metadata() + + # This is only for memory estimation for now. + # NOTE: this method is not accurate while it works for most scenario. + if metadata is None or metadata.kv_cache_manager is None: + q = q.view(1, -1, num_heads, head_dim) + k = k.view(1, -1, num_kv_heads, head_dim) + v = v.view(1, -1, num_kv_heads, head_dim) + return dummy_forward(q, k, v) + + assert isinstance( + metadata, + FlashInferAttentionMetadata, + ) + + # Query + q = q.view(-1, num_heads, head_dim) + + # Key and Value + kv_cache = metadata.kv_cache_manager.get_buffers(layer_idx) + + if k is not None and v is not None: + k = k.view(-1, num_kv_heads, head_dim) + v = v.view(-1, num_kv_heads, head_dim) + + if has_fp8_kv_cache: + assert kv_cache.dtype == torch.float8_e4m3fn, f"KV cache should have fp8 dtype, but get {kv_cache.dtype}" + k = k.to(torch.float8_e4m3fn) + v = v.to(torch.float8_e4m3fn) + assert k.dtype == v.dtype == kv_cache.dtype, f"KV cache dtype {kv_cache.dtype} does not match k/v dtype {k.dtype}/{v.dtype}" + + flashinfer.page.append_paged_kv_cache( + append_key=k, + append_value=v, + batch_indices=metadata.batch_indices, + positions=metadata.positions, + paged_kv_cache=kv_cache, + kv_indices=metadata.paged_kv_indices, + kv_indptr=metadata.paged_kv_indptr, + kv_last_page_len=metadata.paged_kv_last_page_len, + kv_layout=metadata.kv_layout) + + num_contexts = metadata.num_contexts + num_generations = metadata.num_generations + num_ctx_tokens = metadata.num_ctx_tokens + + def prefill_forward(plan_params: PlanParams): + wrapper = metadata.get_prefill_wrapper(plan_params) + output = wrapper.run(q[:num_ctx_tokens], kv_cache) + output = output.view(num_ctx_tokens, -1) + return output + + def decode_forward(plan_params: PlanParams): + wrapper = metadata.get_decode_wrapper(plan_params) + output = wrapper.run(q[num_ctx_tokens:], kv_cache) + output = output.view(num_generations, -1) + return output + + # this will do nothing if the last forward pass had the same parameters + plan_params = metadata.plan(num_heads, + num_kv_heads, + head_dim, + q_dtype=q.dtype, + kv_dtype=kv_cache.dtype, + attention_mask_type=attention_mask_type, + attention_mask_data=attention_mask_data) + + if num_contexts > 0: + ctx_output = prefill_forward(plan_params) + + if num_generations > 0: + gen_output = decode_forward(plan_params) + + if num_contexts > 0 and num_generations > 0: + output = torch.cat([ctx_output, gen_output], dim=0) + elif num_contexts > 0: + output = ctx_output + elif num_generations > 0: + output = gen_output + + return output + + +@forward_pattern.register_fake +@staticmethod +def _( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + num_heads: int, + head_dim: int, + num_kv_heads: int, + layer_idx: int, + has_fp8_kv_cache: bool, + attention_mask_type: int, + attention_mask_data: Optional[torch.Tensor], +): + return torch.empty_like(q) diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 5e55bc582c..ee18a37e92 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ 
@@ -412,34 +412,35 @@ class AttentionBackend(Generic[TMetadata]):
         """
         raise NotImplementedError
-    @torch.library.custom_op("trtllm::attn_dummy_fwd", mutates_args=())
-    @staticmethod
-    def dummy_forward(q: torch.Tensor, k: torch.Tensor,
-                      v: torch.Tensor) -> torch.Tensor:
-        """
-        Dummy attention forward function to estimate memory usage.
-        Args:
-            q (torch.Tensor): Query tensor with shape (1, num_q_tokens, num_heads, head_dim),.
-            k (torch.Tensor): Key tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim)
-            v (torch.Tensor): Value tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim)
-        Returns:
-            torch.Tensor with shape (num_q_tokens, num_heads * head_dim)
-        """
-        head_dim = q.shape[3]
-        assert q.dim() == 4 and q.size()[0] == 1
-        assert k.dim() == 4 and k.size()[0] == 1 and k.size()[3] == head_dim
-        assert v.dim() == 4 and v.size()[0] == 1 and v.size()[3] == head_dim
-        # This is only for memory estimation for now.
-        # NOTE: this method is not accurate while it works for most scenario.
-        o = _flash_attention_forward(q,
-                                     k,
-                                     v,
-                                     attention_mask=None,
-                                     query_length=q.size(1),
-                                     is_causal=True)
-        return o.reshape(o.size(1), -1)
-    @dummy_forward.register_fake
-    def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        num_q_tokens = q.size()[1]
-        return torch.empty_like(q).reshape(num_q_tokens, -1)
+@torch.library.custom_op("trtllm::attn_dummy_fwd", mutates_args=())
+def dummy_forward(q: torch.Tensor, k: torch.Tensor,
+                  v: torch.Tensor) -> torch.Tensor:
+    """
+    Dummy attention forward function to estimate memory usage.
+    Args:
+        q (torch.Tensor): Query tensor with shape (1, num_q_tokens, num_heads, head_dim).
+        k (torch.Tensor): Key tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim).
+        v (torch.Tensor): Value tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim).
+    Returns:
+        torch.Tensor with shape (num_q_tokens, num_heads * head_dim).
+    """
+    head_dim = q.shape[3]
+    assert q.dim() == 4 and q.size()[0] == 1
+    assert k.dim() == 4 and k.size()[0] == 1 and k.size()[3] == head_dim
+    assert v.dim() == 4 and v.size()[0] == 1 and v.size()[3] == head_dim
+    # This is only for memory estimation for now.
+    # NOTE: this method is not accurate, but it works for most scenarios.
+    o = _flash_attention_forward(q,
+                                 k,
+                                 v,
+                                 attention_mask=None,
+                                 query_length=q.size(1),
+                                 is_causal=True)
+    return o.reshape(o.size(1), -1)
+
+
+@dummy_forward.register_fake
+def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    num_q_tokens = q.size()[1]
+    return torch.empty_like(q).reshape(num_q_tokens, -1)
diff --git a/tensorrt_llm/_torch/attention_backend/star_flashinfer.py b/tensorrt_llm/_torch/attention_backend/star_flashinfer.py
index b697a8994c..ae9d17470e 100644
--- a/tensorrt_llm/_torch/attention_backend/star_flashinfer.py
+++ b/tensorrt_llm/_torch/attention_backend/star_flashinfer.py
@@ -4,6 +4,7 @@ import torch
 from ..distributed import allgather
 from ..modules.linear import ParallelConfig
 from .flashinfer import *
+from .interface import dummy_forward
 
 # Please sync with flashinfer's DISPATCH_GQA_GROUP_SIZE in include/flashinfer/utils.cuh
@@ -320,9 +321,7 @@ class StarAttention(AttentionBackend[StarAttentionMetadata]):
 
         # This is only for memory estimation for now.
         # NOTE: this method is not accurate while it works for most scenario.
if metadata is None or metadata.kv_cache_manager is None: - return AttentionBackend.dummy_forward(q.unsqueeze(0), - k.unsqueeze(0), - v.unsqueeze(0)) + return dummy_forward(q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)) num_contexts = metadata.num_contexts num_queries = metadata.num_queries diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index a7ad5a9879..65a2a98e97 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -10,6 +10,8 @@ from tensorrt_llm.functional import AttentionMaskType from tensorrt_llm.logger import logger from tensorrt_llm.models.modeling_utils import QuantConfig +from .interface import dummy_forward + @dataclass(kw_only=True, init=False) class TrtllmAttentionWrapper: @@ -432,7 +434,7 @@ class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]): q = q.reshape(1, -1, num_heads, head_dim).contiguous() k = k.reshape(1, -1, num_kv_heads, head_dim).contiguous() v = v.reshape(1, -1, num_kv_heads, head_dim).contiguous() - return AttentionBackend.dummy_forward(q, k, v) + return dummy_forward(q, k, v) assert isinstance( metadata, diff --git a/tensorrt_llm/_torch/attention_backend/vanilla.py b/tensorrt_llm/_torch/attention_backend/vanilla.py index c0e0643a44..c343f2e7bd 100644 --- a/tensorrt_llm/_torch/attention_backend/vanilla.py +++ b/tensorrt_llm/_torch/attention_backend/vanilla.py @@ -10,7 +10,7 @@ except ImportError: AttentionMaskConverter = None from .interface import (AttentionBackend, AttentionMask, AttentionMetadata, - PredefinedAttentionMask) + PredefinedAttentionMask, dummy_forward) def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -160,9 +160,7 @@ class VanillaAttention(AttentionBackend[VanillaAttentionMetadata]): # This is only for memory estimation for now. # NOTE: this method is not accurate while it works for most scenario. if metadata is None or metadata.kv_cache_manager is None: - return AttentionBackend.dummy_forward(q.unsqueeze(0), - k.unsqueeze(0), - v.unsqueeze(0)) + return dummy_forward(q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)) past_seen_tokens = metadata.kv_cache_params.num_cached_tokens_per_seq cache_indices = [ diff --git a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py new file mode 100644 index 0000000000..f6c2ac3499 --- /dev/null +++ b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py @@ -0,0 +1,149 @@ +# Copyright 2024 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is based on official VILA: https://github.com/NVlabs/VILA/
+# and s2wrapper: https://github.com/bfshi/scaling_on_scales
+
+import math
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+
+# ------------------------------------------------------------------------------------------
+# Original code by Baifeng Shi, licensed under the MIT License:
+# https://github.com/bfshi/scaling_on_scales/blob/master/LICENSE.md
+# ------------------------------------------------------------------------------------------
+
+
+def s2_split_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Divide x into num_split**2 sub-squares and concatenate the sub-squares along the batch dimension.
+    """
+    B, C, H, W = x.shape
+    assert H % num_split == 0 and W % num_split == 0
+    x_split = rearrange(x,
+                        'b c (nh h) (nw w) -> (nh nw b) c h w',
+                        nh=num_split,
+                        nw=num_split)
+    return x_split
+
+
+def s2_merge_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Assuming x contains num_split**2 sub-squares concatenated along the batch dimension, merge the sub-squares back into the original whole square
+    (inverse of s2_split_chessboard).
+    """
+    B, C, H, W = x.shape
+    assert B % (num_split**2) == 0
+    x_merge = rearrange(x,
+                        '(nh nw b) c h w -> b c (nh h) (nw w)',
+                        nh=num_split,
+                        nw=num_split)
+
+    return x_merge
+
+
+def s2_batched_forward(model, x, batch_size=-1):
+    if batch_size == -1:
+        return model(x)
+    else:
+        x_batched = x.split(batch_size)
+        outs = [model(x) for x in x_batched]
+        return torch.cat(outs, dim=0)
+
+
+def multiscale_forward(model,
+                       input,
+                       scales=None,
+                       img_sizes=None,
+                       max_split_size=None,
+                       resize_output_to_idx=0,
+                       num_prefix_token=0,
+                       output_shape='bnc',
+                       split_forward=False):
+
+    assert input.dim() == 4, "Input image must be in the shape of BxCxHxW."
+    assert input.shape[2] == input.shape[
+        3], "Currently only square images are supported."
+    assert output_shape in [
+        'bnc', 'bchw'
+    ], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)."
+    assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token."
+
+    b, c, input_size, _ = input.shape
+
+    # image size for each scale
+    assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes."
+    img_sizes = img_sizes or [int(input_size * scale) for scale in scales]
+
+    # prepare multiscale inputs
+    max_split_size = max_split_size or input_size  # The maximum size of each split of the image. 
Set as the input size by default + num_splits = [math.ceil(size / max_split_size) + for size in img_sizes] # number of splits each scale + input_multiscale = [] + for size, num_split in zip(img_sizes, num_splits): + x = F.interpolate(input.to(torch.float32), size=size, + mode='bicubic').to(input.dtype) + x = s2_split_chessboard(x, num_split=num_split) + input_multiscale.append(x) + + # run feedforward on each scale + outs_multiscale = [ + s2_batched_forward(model, x, b) if split_forward else model(x) + for x in input_multiscale + ] + if num_prefix_token > 0: + outs_prefix_multiscale = [ + out[:, :num_prefix_token] for out in outs_multiscale + ] + outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] + if output_shape == 'bnc': + outs_multiscale = [ + rearrange(out, + 'b (h w) c -> b c h w', + h=int(out.shape[1]**0.5), + w=int(out.shape[1]**0.5)) for out in outs_multiscale + ] + + # merge outputs of different splits for each scale separately + outs_multiscale = [ + s2_merge_chessboard(out, num_split=num_split) + for num_split, out in zip(num_splits, outs_multiscale) + ] + + # interpolate outputs from different scales and concat together + output_size = outs_multiscale[resize_output_to_idx].shape[-2] + out = torch.cat([ + F.interpolate(outs_multiscale[i].to(torch.float32), + size=output_size, + mode='area').to(outs_multiscale[i].dtype) + for i in range(len(outs_multiscale)) + ], + dim=1) + if output_shape == 'bnc': + out = rearrange(out, 'b c h w -> b (h w) c') + if num_prefix_token > 0: + # take the mean of prefix tokens from different splits for each scale + outs_prefix_multiscale = [ + torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) + for out in outs_prefix_multiscale + ] + out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) + out = torch.cat([out_prefix_multiscale, out], dim=1) + + return out diff --git a/tensorrt_llm/_torch/models/modeling_vit.py b/tensorrt_llm/_torch/models/modeling_vit.py index eee67edcfd..c6b634eeab 100644 --- a/tensorrt_llm/_torch/models/modeling_vit.py +++ b/tensorrt_llm/_torch/models/modeling_vit.py @@ -19,12 +19,13 @@ import torch import torch.nn as nn from accelerate.hooks import add_hook_to_module -from s2wrapper import forward as multiscale_forward from transformers import (CLIPImageProcessor, CLIPVisionModel, PretrainedConfig, PreTrainedModel) from transformers.image_processing_utils import BaseImageProcessor from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from .modeling_multimodal_utils import multiscale_forward + class VisionTower(nn.Module): diff --git a/tensorrt_llm/llmapi/_perf_evaluator.py b/tensorrt_llm/llmapi/_perf_evaluator.py index aa048524ac..ddb341204a 100644 --- a/tensorrt_llm/llmapi/_perf_evaluator.py +++ b/tensorrt_llm/llmapi/_perf_evaluator.py @@ -366,8 +366,8 @@ class LLMPerfEvaluator: sample = self.samples[sample_offset] sample_offset += 1 sampling_params.max_tokens = sample.output_len - sampling_params.end_id = -2 - sampling_params.pad_id = -2 + sampling_params.end_id = -1 + sampling_params.pad_id = -1 if self.sampling_extra_params is not None: for key, value in self.sampling_extra_params.items(): setattr(sampling_params, key, value) diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 7723520395..37f965ac4a 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -643,12 +643,18 @@ class MultimodalModelRunner: other_vision_inputs): # same prompt for 
single/multiple image(s) n_prompts_n_images = False - if isinstance( - post_prompt, - list) and len(post_prompt) > 1 and image is not None and len( - post_prompt) == image.shape[0]: - # n prompts and n images - n_prompts_n_images = True + if isinstance(post_prompt, + list) and len(post_prompt) > 1 and image is not None: + if hasattr(image, "pixel_values"): + if len(post_prompt) == image["pixel_values"].shape[0]: + n_prompts_n_images = True + # n prompts and n images + else: + if isinstance( + image, + torch.Tensor) and len(post_prompt) == image.shape[0]: + n_prompts_n_images = True + # n prompts and n images if self.model_type == 'kosmos-2': input_ids = image['input_ids'].clone() @@ -867,7 +873,14 @@ class MultimodalModelRunner: 1] + visual_atts.shape[1] else: post_input_ids = None - length = pre_input_ids.shape[1] + visual_atts.shape[1] + assert pre_input_ids.shape[0] == visual_atts.shape[0] + if visual_atts.shape[0] == 1: + length = pre_input_ids.shape[1] + visual_atts.shape[1] + else: + length = [ + pre_input_ids.shape[1] + visual_atts.shape[1] + for _ in range(visual_atts.shape[0]) + ] if n_prompts_n_images: if isinstance(length, int): length = [length] diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 7c13fa986e..2b89953871 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.17.0.post1" +__version__ = "0.18.0" diff --git a/tests/_torch/test_fp4_gemm_quantize.py b/tests/_torch/test_fp4_gemm_quantize.py index 58d6210bd9..ac47c58e26 100644 --- a/tests/_torch/test_fp4_gemm_quantize.py +++ b/tests/_torch/test_fp4_gemm_quantize.py @@ -17,6 +17,7 @@ import os import sys import unittest +import pytest import torch from parameterized import parameterized @@ -56,6 +57,7 @@ class TestFunctional(unittest.TestCase): ) @skip_pre_blackwell_unittest def test_fp4_quantize_gemm_torch(self, m, n, k): + pytest.skip("https://nvbugs/5100633") a = torch.randn([m, k], dtype=torch.float32) b = torch.randn([n, k], dtype=torch.float32) a_global_sf = (448 * 6) / a.abs().max().float() diff --git a/tests/llmapi/test_llm_models.py b/tests/llmapi/test_llm_models.py index bc63cfda9a..559b365625 100644 --- a/tests/llmapi/test_llm_models.py +++ b/tests/llmapi/test_llm_models.py @@ -37,7 +37,7 @@ gpt_neox_20b_model_path = get_model_path('gpt-neox-20b') commandr_v01_model_path = get_model_path('c4ai-command-r-v01') commandr_plus_model_path = get_model_path('c4ai-command-r-plus') deepseek_v1_model_path = get_model_path("deepseek-moe-16b-base") -sampling_params = SamplingParams(max_tokens=10) +sampling_params = SamplingParams(max_tokens=10, end_id=-1) @force_ampere