Update TensorRT-LLM (#2215)

* Update TensorRT-LLM

---------

Co-authored-by: Sherlock Xu <65327072+Sherlock113@users.noreply.github.com>
Kaiyu Xie 2024-09-10 18:21:22 +08:00 committed by GitHub
parent 78f5c2936b
commit 31ac30e928
272 changed files with 764351 additions and 84677 deletions

.gitmodules

@ -11,3 +11,6 @@
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git

3rdparty/ucxx (new submodule)

@ -0,0 +1 @@
Subproject commit b99181779672965c6f325a95a29eb433b6e9cbbd


@ -17,6 +17,9 @@ TensorRT-LLM
<div align="left">
## Latest News
* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)
* [2024/08/20] 🏎SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)


@ -160,6 +160,10 @@ or manually set a max sequence length that you plan to run with specifically:
trtllm-bench --model meta-llama/Llama-2-7b-hf build --max_seq_len 256 --quantization FP8
```
> [!NOTE] `trtllm-bench build` reproduces benchmark engines for performance study. These engine
configurations are not guaranteed to be optimal for all cases and should be viewed as reproducers
for the benchmark data we provide on our [Performance Overview](../docs/source/performance/perf-overview.md).
Looking a little closer, the `build` sub-command
will perform a lookup and build an engine using those reference settings. The
lookup table directly corresponds to the performance table found in our


@ -157,7 +157,7 @@ struct BenchmarkParams
int randomSeed = 430;
std::optional<std::vector<int>> maxAttentionWindowVec{std::nullopt};
std::optional<int> sinkTokenLength{std::nullopt};
bool multiBlockMode{false};
bool multiBlockMode{true};
bool enableContextFMHAFP32Acc{false};
// lora / peft params
@ -1943,7 +1943,7 @@ int main(int argc, char* argv[])
options.add_options()("multi_block_mode",
"Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel",
cxxopts::value<bool>()->default_value("false"));
cxxopts::value<bool>()->default_value("true"));
options.add_options()(
"encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value<std::string>());


@ -381,7 +381,7 @@ endif()
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}"
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}"
)
# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
@ -538,6 +538,30 @@ elseif(NOT WIN32)
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()
# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
if(NOT ${ucx_FOUND})
set(ENABLE_UCX 0)
else()
# Installing ucxx via add_subdirectory results in a strange cudart linking
# error, so its installation script is used to isolate the build until the
# issue is understood. The build is always triggered so that a change in
# USE_CXX11_ABI is not ignored.
execute_process(
COMMAND
${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
--cmake-args=\"-DBUILD_SHARED_LIBS=OFF
-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
COMMAND_ECHO STDOUT)
find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
NO_DEFAULT_PATH)
endif()
endif()
file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)


@ -41,6 +41,7 @@ auto constexpr kDraftInputIdsTensorName = "draft_input_ids";
auto constexpr kDraftLogitsTensorName = "draft_logits";
auto constexpr kMaxNewTokensTensorName = "request_output_len";
auto constexpr kBeamWidthTensorName = "beam_width";
auto constexpr kNumReturnSequencesTensorName = "num_return_sequences";
auto constexpr kEndIdTensorName = "end_id";
auto constexpr kPadIdTensorName = "pad_id";
auto constexpr kBadWordsListTensorName = "bad_words_list";
@ -194,6 +195,7 @@ public:
inference_request::kDraftLogitsTensorName,
inference_request::kMaxNewTokensTensorName,
inference_request::kBeamWidthTensorName,
inference_request::kNumReturnSequencesTensorName,
inference_request::kEndIdTensorName,
inference_request::kPadIdTensorName,
inference_request::kBadWordsListTensorName,
@ -263,6 +265,7 @@ public:
TENSOR_GETTER_SETTER(DraftLogits, inference_request::kDraftLogitsTensorName)
TENSOR_GETTER_SETTER(MaxNewTokens, inference_request::kMaxNewTokensTensorName)
TENSOR_GETTER_SETTER(BeamWidth, inference_request::kBeamWidthTensorName)
TENSOR_GETTER_SETTER(NumReturnSequences, inference_request::kNumReturnSequencesTensorName)
TENSOR_GETTER_SETTER(EndId, inference_request::kEndIdTensorName)
TENSOR_GETTER_SETTER(PadId, inference_request::kPadIdTensorName)
TENSOR_GETTER_SETTER(BadWordsList, inference_request::kBadWordsListTensorName)


@ -85,6 +85,7 @@ public:
using TensorPtr = TTensor;
using LogitsPostProcessor = std::function<void(
RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional<RequestIdType>)>;
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
@ -107,7 +108,8 @@ public:
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt)
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
: mRequestId(requestId)
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
@ -152,11 +154,14 @@ public:
, mEncoderOutputLength(encoderOutputLength)
, mLlmRequestType(llmRequestType)
, mInputTokenExtraIds(std::move(inputTokenExtraIds))
, mNumReturnSequences(numReturnSequences)
, mSequenceIndex(0)
{
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
}
initialize(*inputTokens, returnLogProbs);
}
@ -202,6 +207,8 @@ public:
, mEncoderOutputLength(req.getEncoderOutputLength())
, mContextPhaseParams(req.getContextPhaseParams())
, mInputTokenExtraIds(std::nullopt)
, mNumReturnSequences(req.getNumReturnSequences())
, mSequenceIndex(0)
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
{
@ -217,6 +224,7 @@ public:
"length).");
mReturnAllGeneratedTokens = true;
}
if (mIsStreaming && mSamplingConfig.beamWidth > 1 && mReturnGenerationLogits == true)
{
TLLM_LOG_WARNING(
@ -276,13 +284,15 @@ public:
mLoraTaskId = loraConfig->getTaskId();
if (loraConfig.value().getWeights())
{
mLoraWeights = executor::detail::toITensor(loraConfig.value().getWeights().value());
mLoraWeights = tensorrt_llm::runtime::ITensor::view(
executor::detail::toITensor(loraConfig.value().getWeights().value()));
mLoraWeights.value()->unsqueeze(0);
}
if (loraConfig.value().getConfig())
{
mLoraConfig = executor::detail::toITensor(loraConfig.value().getConfig().value());
mLoraConfig = tensorrt_llm::runtime::ITensor::view(
executor::detail::toITensor(loraConfig.value().getConfig().value()));
mLoraConfig.value()->unsqueeze(0);
}
}
@ -429,6 +439,20 @@ public:
return mTokens.at(beam).size() - mNumPreDecodedTokens[beam];
}
/// @brief Get number of return sequences for this req.
/// @return The number of sequences to return.
[[nodiscard]] SizeType32 getNumReturnSequences() const
{
return mNumReturnSequences;
}
/// @brief Get child requests spawned by this req.
/// @return A vector of child requests.
[[nodiscard]] std::vector<RequestPtr> const& getChildRequests() const
{
return mChildRequests;
}
/// @brief Get max number of tokens across all beams
/// @return The number of tokens
[[nodiscard]] SizeType32 getMaxBeamNumTokens() const
@ -618,6 +642,25 @@ public:
}
}
/// @brief Sets the number of return sequences.
/// @param numReturnSequences The number of return sequences.
void setNumReturnSequences(SizeType32 const& numReturnSequences)
{
TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot change numReturnSequences.");
TLLM_CHECK_WITH_INFO(
numReturnSequences > 0, "numReturnSequences should be a positive integer, got %d.", numReturnSequences);
TLLM_CHECK_WITH_INFO(mChildRequests.size() <= static_cast<size_t>(numReturnSequences),
"Cannot set numReturnSequences %d smaller than the number %ld of child requests that have already created.",
numReturnSequences, mChildRequests.size());
mNumReturnSequences = numReturnSequences;
mSequenceFinalVec->resize(mNumReturnSequences);
}
[[nodiscard]] bool constexpr isChild() const noexcept
{
return mSequenceIndex > 0;
}
/// @brief Return a vector of the last-generated tokens of shape [num_beams]
[[nodiscard]] VecTokens const& getLastTokens()
{
@ -886,6 +929,11 @@ public:
mEncoderOutputHost = std::move(encoderOutputHost);
}
void setEncoderOutput(TensorPtr encoderOutput)
{
mEncoderOutput = std::move(encoderOutput);
}
void allocEncoderOutputHost(SizeType32 encoderHiddenSize, nvinfer1::DataType dataType)
{
mEncoderOutputHost = runtime::BufferManager::pinned(
@ -1204,7 +1252,14 @@ public:
TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);
executor::Result result;
result.isFinal = isGenerationCompleteState() || isDisaggContextTransmissionState();
result.sequenceIndex = mSequenceIndex;
result.isSequenceFinal = isGenerationCompleteState() || isDisaggContextTransmissionState();
mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal;
result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(),
[](bool isSequenceFinal) { return isSequenceFinal; });
auto const nbBeams = mSamplingConfig.beamWidth;
auto const maxNbTokens = getMaxBeamNumTokens();
@ -1295,7 +1350,9 @@ public:
// Update position of last sent response
setMaxSentTokenLen(maxNbTokens);
auto response = executor::Response(mRequestId, std::move(result));
auto requestId = isChild() ? mParentRequestId : mRequestId;
auto response = executor::Response(requestId, std::move(result));
return response;
}
}
@ -1413,6 +1470,12 @@ protected:
// TODO: add real extra id for encoder tokens
std::optional<std::shared_ptr<VecUniqueTokens>> mEncoderUniqueTokens;
SizeType32 mNumReturnSequences;
SizeType32 mSequenceIndex;
std::vector<RequestPtr> mChildRequests;
RequestIdType mParentRequestId;
std::shared_ptr<std::vector<bool>> mSequenceFinalVec; // Indicates whether each sibling sequence has finished generation.
private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
{
@ -1475,6 +1538,12 @@ private:
}
setReturnLogProbs(outputLogProbs);
if (!isChild())
{
// Initialize result states unless this is a child request; a child shares its parent's states.
mSequenceFinalVec = std::make_shared<std::vector<bool>>(getNumReturnSequences(), false);
}
}
TensorPtr createListTensor(std::list<VecTokens> const& wordsList)
@ -1540,7 +1609,8 @@ public:
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt,
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt)
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
SizeType32 numReturnSequences = 1)
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
@ -1548,18 +1618,49 @@ public:
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
std::move(encoderInputFeatures), std::move(encoderOutputLength), llmRequestType,
std::move(inputTokenExtraIds))
std::move(inputTokenExtraIds), numReturnSequences)
{
}
LlmRequest(RequestIdType requestId, executor::Request const& Request,
LlmRequest(RequestIdType requestId, executor::Request const& request,
std::optional<Base::LogitsPostProcessor> logitsPostProcessor = std::nullopt,
bool applyLogitsPostProcessorBatched = false)
: Base(requestId, Request)
: Base(requestId, request)
{
mLogitsPostProcessor = std::move(logitsPostProcessor);
mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched;
mLookaheadConfig = Request.getLookaheadConfig();
mLookaheadConfig = request.getLookaheadConfig();
}
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)
{
TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot create its own child.");
TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumReturnSequences()),
"Cannot create child requests more than the number of return sequences (%d)", getNumReturnSequences());
auto childReq = std::make_shared<LlmRequest>(*this);
childReq->mRequestId = requestId;
childReq->mSequenceIndex = mChildRequests.size() + 1;
childReq->mParentRequestId = this->mRequestId;
childReq->mSequenceFinalVec = this->mSequenceFinalVec;
childReq->mSeqSlot.reset();
// To ensure different randomness across children, assign a unique random seed to each child
// by adding its sequence index to the base seed. If no seed is provided, the parent's seed defaults to 0.
using RandomSeedType = tensorrt_llm::executor::RandomSeedType;
if (childReq->mSamplingConfig.randomSeed.has_value())
{
childReq->mSamplingConfig.randomSeed->at(0) += static_cast<RandomSeedType>(childReq->mSequenceIndex);
}
else
{
RandomSeedType defaultSeed{0};
mSamplingConfig.randomSeed = std::vector<RandomSeedType>(1, defaultSeed);
childReq->mSamplingConfig.randomSeed
= std::vector<RandomSeedType>(1, defaultSeed + static_cast<RandomSeedType>(childReq->mSequenceIndex));
}
mChildRequests.push_back(childReq);
return childReq;
}
void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager)
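A caller is expected to expand a parent request into its siblings through this API. As a rough sketch (not code from this commit; the header path, namespace, and the way fresh request IDs are obtained are assumptions):

```cpp
// Rough sketch, not part of this commit: expand a parent LlmRequest into one
// entry per return sequence via the new child-request API. The header path,
// namespace, and request-id allocation are assumptions.
#include "tensorrt_llm/batch_manager/llmRequest.h"

#include <memory>
#include <vector>

using tensorrt_llm::batch_manager::LlmRequest;

std::vector<std::shared_ptr<LlmRequest>> expandRequest(
    std::shared_ptr<LlmRequest> const& parent, LlmRequest::RequestIdType& nextRequestId)
{
    std::vector<std::shared_ptr<LlmRequest>> sequences{parent};
    // The parent itself produces sequence index 0; each child gets the next
    // index and a random seed offset by that index (see createChildRequest above).
    for (int seq = 1; seq < parent->getNumReturnSequences(); ++seq)
    {
        sequences.push_back(parent->createChildRequest(nextRequestId++));
    }
    return sequences;
}
```

Responses from the children are reported under the parent's request id (see the `createResponse` change above), so `sequenceIndex` is what distinguishes the siblings downstream.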


@ -32,7 +32,7 @@ using runtime::SizeType32;
struct PeftCacheManagerConfig
{
static float constexpr kDefaultDeviceCachePercent = 0.05;
static float constexpr kDefaultDeviceCachePercent = 0.02;
static size_t constexpr kDefaultHostCacheSize = 1024 * 1024 * 1024;
explicit PeftCacheManagerConfig(SizeType32 numHostModuleLayer = 0, SizeType32 numDeviceModuleLayer = 0,


@ -78,15 +78,6 @@ public:
{
}
// Copy constructor
TrtGptModelOptionalParams(TrtGptModelOptionalParams const& other)
: TrtGptModelOptionalParams(other.kvCacheConfig, other.enableTrtOverlap, other.deviceIds,
other.normalizeLogProbs, other.enableChunkedContext, other.peftCacheManagerConfig, other.decodingConfig,
other.gpuWeightsPercent, other.maxBeamWidth, other.maxBatchSize, other.maxNumTokens, other.schedulerConfig,
other.extendedRuntimePerfKnobConfig, other.debugConfig, other.maxSeqIdleMicroseconds)
{
}
bool operator==(TrtGptModelOptionalParams const& other) const
{
return kvCacheConfig == other.kvCacheConfig //


@ -0,0 +1,38 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#ifndef _WIN32
#include <pthread.h>
#endif
namespace tensorrt_llm::common
{
inline bool setThreadName(std::string const& name)
{
#ifdef _WIN32
return false;
#else
auto const ret = pthread_setname_np(pthread_self(), name.c_str());
return !ret;
#endif
}
} // namespace tensorrt_llm::common
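A minimal usage sketch for the new helper (the header path and thread name are assumptions; on Linux, `pthread_setname_np` rejects names longer than 15 characters with ERANGE):

```cpp
// Usage sketch only; the header path and thread name are arbitrary.
#include "tensorrt_llm/common/threadUtils.h" // assumed location of setThreadName

#include <thread>

int main()
{
    std::thread worker(
        []()
        {
            // Returns false on Windows or when pthread_setname_np reports an error.
            tensorrt_llm::common::setThreadName("trtllmWorker");
        });
    worker.join();
    return 0;
}
```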


@ -24,6 +24,7 @@
#include <deque>
#include <filesystem>
#include <list>
#include <map>
#include <memory>
#include <optional>
#include <string>
@ -343,6 +344,7 @@ public:
/// convolution down-sampling, etc.)
/// @param type Indicate the request type for disaggregated serving mode.
/// @param contextPhaseParams Generated token ID from context only executor.
/// @param numReturnSequences The number of sequences to return.
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false,
SamplingConfig const& samplingConfig = SamplingConfig(), OutputConfig const& outputConfig = OutputConfig(),
std::optional<SizeType32> const& endId = std::nullopt, std::optional<SizeType32> const& padId = std::nullopt,
@ -360,7 +362,7 @@ public:
RequestType type = RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
std::optional<ContextPhaseParams> contextPhaseParams = std::nullopt,
std::optional<Tensor> encoderInputFeatures = std::nullopt,
std::optional<SizeType32> encoderOutputLength = std::nullopt);
std::optional<SizeType32> encoderOutputLength = std::nullopt, SizeType32 numReturnSequences = 1);
/// @brief This logits postprocessor name will dispatch to the batched logits postprocessor
static auto constexpr kBatchedPostProcessorName = "batched";
@ -396,6 +398,7 @@ public:
[[nodiscard]] std::optional<Tensor> getEncoderInputFeatures() const;
[[nodiscard]] std::optional<SizeType32> getEncoderOutputLength() const;
[[nodiscard]] RequestType getRequestType() const;
[[nodiscard]] SizeType32 getNumReturnSequences() const;
void setStreaming(bool streaming);
void setSamplingConfig(SamplingConfig const& config);
@ -419,6 +422,7 @@ public:
void setContextPhaseParams(ContextPhaseParams contextPhaseParams);
void setEncoderInputFeatures(Tensor encoderInputFeatures);
void setEncoderOutputLength(SizeType32 encoderOutputLength);
void setNumReturnSequences(SizeType32 numReturnSequences);
private:
friend class Serialization;
@ -461,6 +465,12 @@ struct Result
/// @brief The decoding iterations it takes.
SizeType32 decodingIter{0};
/// @brief The index of the output sequence where 0 <= sequenceIndex < numReturnSequences
SizeType32 sequenceIndex{0};
/// @brief Indicates if this is the final result for a given sequence in the request
bool isSequenceFinal;
};
/// @brief Class that holds either an error or a result
@ -583,7 +593,7 @@ private:
class ExtendedRuntimePerfKnobConfig
{
public:
explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = false, bool enableContextFMHAFP32Acc = false);
explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false);
bool operator==(ExtendedRuntimePerfKnobConfig const& other) const
{
@ -612,27 +622,33 @@ class DebugConfig
using StringVec = std::vector<std::string>;
public:
explicit DebugConfig(bool dumpInputTensors = false, bool dumpOuputTensors = false, StringVec debugTensorNames = {});
explicit DebugConfig(bool debugInputTensors = false, bool debugOutputTensors = false,
StringVec debugTensorNames = {}, SizeType32 debugTensorsMaxIterations = 0);
bool operator==(DebugConfig const& other) const;
[[nodiscard]] bool getDumpInputTensors() const;
[[nodiscard]] bool getDumpOutputTensors() const;
[[nodiscard]] bool getDebugInputTensors() const;
[[nodiscard]] bool getDebugOutputTensors() const;
[[nodiscard]] StringVec const& getDebugTensorNames() const;
[[nodiscard]] SizeType32 getDebugTensorsMaxIterations() const;
void setDumpInputTensors(bool dumpInputTensors);
void setDumpOuputTensors(bool dumpOuputTensors);
void setDebugInputTensors(bool debugInputTensors);
void setDebugOutputTensors(bool debugOutputTensors);
void setDebugTensorNames(StringVec const& debugTensorNames);
void setDebugTensorsMaxIterations(SizeType32 debugTensorsMaxIterations);
private:
friend class Serialization;
/// @brief If true, dump all input tensors.
bool mDumpInputTensors;
/// @brief If true, dump all output tensors.
bool mDumpOuputTensors;
/// @brief If not empty, only dump tensors in this list.
/// @brief If true, debug all input tensors.
bool mDebugInputTensors;
/// @brief If true, debug all output tensors.
bool mDebugOutputTensors;
/// @brief If not empty, only debug tensors in this list.
StringVec mDebugTensorNames;
/// @brief If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations,
/// else dump them to files.
SizeType32 mDebugTensorsMaxIterations;
};
SizeType32 const kDefaultIterStatsMaxIterations = 1000;
@ -960,7 +976,8 @@ public:
ModelType modelType, ExecutorConfig const& executorConfig);
Executor(BufferView const& engineBuffer, std::string const& jsonConfigStr, ModelType modelType,
ExecutorConfig const& executorConfig);
ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeights = std::nullopt);
Executor(BufferView const& encoderEngineBuffer, std::string const& encoderJsonConfigStr,
BufferView const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, ModelType modelType,
@ -1021,20 +1038,25 @@ public:
/// @param id The request id for which to cancel the response
void cancelRequest(IdType requestId);
/// @brief Signals the server to shutdown
/// This call is blocking. Only returns when all requests have terminated or timeout has been reached
/// @brief Signals the server to shutdown.
/// @details This call is blocking. Only returns when all requests have terminated or timeout has been reached
void shutdown();
/// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats
/// Contains at most iterStatsMaxIterations iterations
/// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats.
/// Contains at most iterStatsMaxIterations iterations.
/// @return Iteration stats
std::deque<IterationStats> getLatestIterationStats();
/// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats
/// Contains at most requestStatsMaxIterations iterations
/// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats.
/// Contains at most requestStatsMaxIterations iterations.
/// @return Request stats grouped by iterations
std::deque<RequestStatsPerIteration> getLatestRequestStats();
/// @brief Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors.
/// Contains at most debugTensorsMaxIterations iterations.
/// @return Request debug tensors grouped by iterations
std::deque<DebugTensorsPerIteration> getLatestDebugTensors();
/// @brief Indicates if the current process is allowed to enqueueRequests
[[nodiscard]] bool canEnqueueRequests() const;
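Putting the new `numReturnSequences` pieces together on the client side, a hedged sketch (prompt tokens and executor setup are placeholders, not from this commit):

```cpp
// Client-side sketch, not from this commit: ask for three sequences from a
// single prompt and drain responses. Prompt tokens and executor setup are
// placeholders.
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

void generateThreeSequences(tle::Executor& executor)
{
    tle::Request request(/*inputTokenIds=*/{1, 2, 3, 4}, /*maxTokens=*/32);
    request.setNumReturnSequences(3);

    auto const requestId = executor.enqueueRequest(request);

    bool requestDone = false;
    while (!requestDone)
    {
        for (auto const& response : executor.awaitResponses(requestId))
        {
            auto const& result = response.getResult();
            // sequenceIndex identifies the sibling in [0, numReturnSequences);
            // isSequenceFinal closes that sibling, isFinal closes the request.
            requestDone = requestDone || result.isFinal;
        }
    }
}
```

All sibling responses arrive under the single id returned by `enqueueRequest`, so `sequenceIndex` is what tells the siblings apart on the client side.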


@ -25,6 +25,13 @@
namespace tensorrt_llm::executor
{
namespace kv_cache
{
class CommState;
class CacheState;
class SocketState;
} // namespace kv_cache
class Serialization
{
public:
@ -53,6 +60,21 @@ public:
static void serialize(LoraConfig const& config, std::ostream& os);
[[nodiscard]] static size_t serializedSize(LoraConfig const& config);
// CommState
[[nodiscard]] static kv_cache::CommState deserializeCommState(std::istream& is);
static void serialize(kv_cache::CommState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::CommState const& state);
// SocketState
[[nodiscard]] static kv_cache::SocketState deserializeSocketState(std::istream& is);
static void serialize(kv_cache::SocketState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::SocketState const& state);
// CacheState
[[nodiscard]] static kv_cache::CacheState deserializeCacheState(std::istream& is);
static void serialize(kv_cache::CacheState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::CacheState const& state);
// ContextPhaseState
[[nodiscard]] static ContextPhaseState deserializeContextPhaseState(std::istream& is);
static void serialize(ContextPhaseState const& contextPhaseState, std::ostream& os);
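A small sketch of how the new kv_cache serialization hooks might be driven (header path assumed; how a `CommState` is obtained is outside this diff):

```cpp
// Sketch: pack a kv_cache::CommState into bytes with the new hooks; the header
// path is assumed and constructing a CommState is outside this diff.
#include "tensorrt_llm/executor/serialization.h"

#include <sstream>
#include <string>

namespace tle = tensorrt_llm::executor;

std::string packCommState(tle::kv_cache::CommState const& state)
{
    std::ostringstream os;
    tle::Serialization::serialize(state, os);
    // serializedSize(state) should equal the number of bytes written above.
    return os.str();
}
```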


@ -18,6 +18,7 @@
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
@ -361,6 +362,15 @@ struct RequestStatsPerIteration
std::vector<RequestStats> requestStats;
};
/// @brief Struct that holds the debug tensors in an iteration
struct DebugTensorsPerIteration
{
/// @brief The iteration id for these tensors
IterationType iter;
/// @brief The debug tensors for this iteration
std::map<std::string, Tensor> debugTensors;
};
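The new struct pairs with `Executor::getLatestDebugTensors()` and the reworked `DebugConfig`; a hedged sketch with arbitrary knob values:

```cpp
// Sketch with arbitrary knob values: keep debug tensors for the last few
// iterations in memory instead of dumping them to files, then read them back.
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

tle::DebugConfig makeDebugConfig()
{
    // debugTensorsMaxIterations > 0 retains tensors for that many past iterations.
    return tle::DebugConfig(/*debugInputTensors=*/true, /*debugOutputTensors=*/true,
        /*debugTensorNames=*/{}, /*debugTensorsMaxIterations=*/4);
}

void inspectDebugTensors(tle::Executor& executor)
{
    for (auto const& iteration : executor.getLatestDebugTensors())
    {
        // debugTensors maps tensor names to their values for iteration `iteration.iter`.
        for (auto const& [name, tensor] : iteration.debugTensors)
        {
            (void) name;
            (void) tensor;
        }
    }
}
```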
/// @brief The reason why the model stopped generating tokens for a request.
enum class FinishReason
{


@ -32,7 +32,7 @@ public:
using BufferPtr = IBuffer::SharedPtr;
// MAX_ALL_REDUCE_BLOCKS for block_barrier, 1 for multi_gpu_barrier
size_t static constexpr FLAGS_SIZE = (kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t);
size_t static constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t);
IpcMemory(
std::size_t bufferSize, BufferManager const& manager, WorldConfig const& worldConfig, bool openIpc = true);


@ -17,9 +17,11 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/executor/tensor.h"
#include <NvInferRuntime.h>
#include <filesystem>
#include <map>
#include <optional>
namespace tensorrt_llm::runtime
@ -75,6 +77,17 @@ public:
mEnginePath = std::move(enginePath);
}
[[nodiscard]] std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> const&
getManagedWeightsMapOpt() const
{
return mManagedWeightsMap;
}
void setManagedWeightsMap(std::map<std::string, tensorrt_llm::executor::Tensor> managedWeightsMap)
{
mManagedWeightsMap = std::move(managedWeightsMap);
}
[[nodiscard]] void const* getAddress() const
{
TLLM_CHECK(mType == AddressWithSize);
@ -104,6 +117,7 @@ private:
};
nvinfer1::IHostMemory const* mEngineBuffer{};
std::optional<std::map<std::string, tensorrt_llm::executor::Tensor>> mManagedWeightsMap;
};
} // namespace tensorrt_llm::runtime


@ -186,6 +186,16 @@ find_package(Threads REQUIRED)
target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE Threads::Threads)
target_link_libraries(${EXECUTOR_TARGET} INTERFACE Threads::Threads)
if(ENABLE_UCX)
find_package(ucx REQUIRED)
find_package(ucxx REQUIRED)
if(BUILD_BATCH_MANAGER)
target_include_directories(
${BATCH_MANAGER_TARGET}
PRIVATE $<TARGET_PROPERTY:ucxx::ucxx,INTERFACE_INCLUDE_DIRECTORIES>)
endif()
endif()
if(NOT WIN32)
if(USE_CXX11_ABI)
add_custom_command(
@ -331,6 +341,10 @@ if(ENABLE_MULTI_DEVICE)
set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} ${MPI_C_LIBRARIES} ${NCCL_LIB})
endif()
if(ENABLE_UCX)
set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} ucxx::ucxx ucx::ucs)
endif()
if(NOT WIN32) # Unix-like compilers
set(UNDEFINED_FLAG "-Wl,--no-undefined")
set(AS_NEEDED_FLAG "-Wl,--as-needed")
@ -366,6 +380,9 @@ target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})
link_whole_archive(${SHARED_TARGET} ${BATCH_MANAGER_TARGET})
link_whole_archive(${SHARED_TARGET} ${EXECUTOR_TARGET})
link_whole_archive(${SHARED_TARGET} ${INTERNAL_CUTLASS_KERNELS_TARGET})
if(ENABLE_UCX)
link_whole_archive(${SHARED_TARGET} ucxx::ucxx)
endif()
# Cyclic dependency of batch manager on TRT-LLM
target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE ${SHARED_TARGET})


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5837518b278aa82cfdaeb3279bfe396de8e0638d31c3447f2eaa7443c22fa3f7
size 4459926
oid sha256:1ce35a0714ef753c5328aa982b1fefa58b90994bd87a6739634ec47ec9373f9e
size 4565552


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ff7c6c16dc4755cfcf398d6b92ddd056ad6ae40f1dd830deeccd37ceb795edb6
size 4567634
oid sha256:1aa2a508d865410915b9ae4b21b11062a07d6143d19bc84fa53145da7911aa2a
size 4667530


@ -1,3 +1,3 @@
fe9c16bd1eed122234ece7f9afeea382 libtensorrt_llm_batch_manager_static.a
040e15c175f987c30ebfdbcc8a9c2021 libtensorrt_llm_batch_manager_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
7a30229eedc22a924052cd5440c5adb4 libtensorrt_llm_batch_manager_static.a
e46c1e13209f90acdcc8b5f0c9e8a15c libtensorrt_llm_batch_manager_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1a7311134408b791dbd4f11ad3144f67314a8e6a288d14f12767004d79a82ac2
size 4318978
oid sha256:106cab5936a2ac034785050804890aa4deb1436983215439462c244475ebb90c
size 4422078


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8181b7a634293d981caf0f5011c2619f541c02e818aae8fbff4bf4e72cea6cab
size 4291752
oid sha256:cb9ad53702b1fbe66dbb989a50422ab4cf01c53b4943d48296cc96f92742c363
size 4388652


@ -1,3 +1,3 @@
3869999ed0175550deb0d73d0ab0fd08 libtensorrt_llm_batch_manager_static.a
e6e05e4c36d868dfb1f9c93c77993cbd libtensorrt_llm_batch_manager_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
251ebd85cff41a2af7f6dcca8489f8fb libtensorrt_llm_batch_manager_static.a
801b1b6ffd0ab4ec3a66afeb010d97c4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce95b46f9ae1ff46967984f0c956a0b51bc7c57cebbf0ed6e553729ce84fe8b6
size 26424318
oid sha256:291630c536d8262087c9dd5f3bbd4c9b301aea8afbae4af9bc2cdce4db4e8f23
size 27510016


@ -1,2 +1,2 @@
6c8dbccd4cde7ca451e8e99ecb480f55 tensorrt_llm_batch_manager_static.lib
052edd4c2bca0a186eed2169a9681d317f67a712 commit
8873c98ec05794c5ebaf05c8da73dd65 tensorrt_llm_batch_manager_static.lib
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:356deeea43040cf5529f89bd880028b8cf0a0600967df382b4107e796d9a301c
size 1630654
oid sha256:1f7087b56c34700e048ee9d40086b34d65952e66507ea36986ab11260e0a3300
size 1759444


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:115bf7be63c22bb7fd69cafa600ebe9785b104e78e1a7e5b475bcbe1d2069037
size 1655182
oid sha256:3288603c000a6eecb7a28319c69536ac0b37b2337474330b61442b9940e1d988
size 1787862


@ -1,3 +1,3 @@
aae0acac4fab096666be84b6e630bd71 libtensorrt_llm_executor_static.a
a174142f8f74f1c1a439cc3d040b0b5f libtensorrt_llm_executor_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
cfe12cab670a58d56b0a2d881c218015 libtensorrt_llm_executor_static.a
b177ad21ac5636091ee267a5a550aa77 libtensorrt_llm_executor_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e5beefedcd2309d37550bac7e9d9ec0cf7f7c18d2fbf9a2d9bf9c7625954b6c8
size 1694400
oid sha256:f565a5225dec3f1f88df931d8f8a0718f2ab24f705a639ae4ebf358b37b4555d
size 1824992


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89acdd3e6b904f9dc86bcf8e73fe03c43b82ceab5d325357ea1f3ed1ef797aaf
size 1615086
oid sha256:21aaf66b1435d4fec2f41ce0029522da71509cb7c6a856bbf9411c88c105cd5c
size 1735024


@ -1,3 +1,3 @@
5bebbd31919ac2b34579c8653295dfd2 libtensorrt_llm_executor_static.a
2332b6ea1e0b8683844168949f9dfb9c libtensorrt_llm_executor_static.pre_cxx11.a
052edd4c2bca0a186eed2169a9681d317f67a712 commit
4dcc8b42ff4afe11178bc2f145394b41 libtensorrt_llm_executor_static.a
ead139d0835f7e86d2bf7ecd41ad0999 libtensorrt_llm_executor_static.pre_cxx11.a
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit


@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6fceeee8a39af844a5cfefd32d3b2b024b659f5d7dceacc0f5dd0b69b5d37b7c
size 17485396
oid sha256:9763058bec6c637ec101384b9e89681ff315ddd514fb1e37fde8ef5c51de540a
size 19341056


@ -1,2 +1,2 @@
ac61b12b5aa440f5f8f0e05511a12d17 tensorrt_llm_executor_static.lib
052edd4c2bca0a186eed2169a9681d317f67a712 commit
94894f3c80436b5dfbb6864dbe686baa tensorrt_llm_executor_static.lib
0e09af04945b8cb0eaf65780a7fa7907a2baee1a commit

Some files were not shown because too many files have changed in this diff.