Merge branch 'main' into update_mnnvl_test

commit 6b6daf31a8 by Bo Li, 2025-12-17 01:57:20 +08:00 (committed by GitHub)
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
3346 changed files with 66356 additions and 24893 deletions

.gitattributes

@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text

View File

@ -0,0 +1,127 @@
name: Manage Waiting for Feedback Label
on:
issue_comment:
types: [created]
pull_request_review_comment:
types: [created]
permissions:
issues: write
pull-requests: write
jobs:
manage-waiting-for-feedback:
runs-on: ubuntu-latest
if: github.repository == 'NVIDIA/TensorRT-LLM'
steps:
- name: Check membership and manage label
uses: actions/github-script@v8
with:
script: |
const commenter = context.payload.comment.user.login;
const commenterType = context.payload.comment.user.type;
const label = 'waiting for feedback';
// Ignore bots and CI accounts
const ignoredAccounts = ['tensorrt-cicd'];
if (commenterType === 'Bot' || ignoredAccounts.includes(commenter)) {
console.log(`Ignoring comment from ${commenter} (type: ${commenterType}). Skipping.`);
return;
}
// Handle both issue_comment and pull_request_review_comment events
// context.issue.number is only available for issue_comment events
const issueNumber = context.issue?.number || context.payload.pull_request?.number;
const issue = context.payload.issue || context.payload.pull_request;
const author = issue?.user?.login;
const isAuthor = (commenter === author);
if (!issueNumber) {
console.log('Could not determine issue/PR number. Skipping.');
return;
}
console.log(`Comment by ${commenter} on #${issueNumber} (author: ${author})`);
const owner = context.repo.owner;
const repo = context.repo.repo;
// Check if commenter is repository member
let isMember = false;
try {
await github.rest.repos.checkCollaborator({
owner,
repo,
username: commenter
});
isMember = true;
} catch (error) {
if (error.status === 404) {
isMember = false;
} else if (error.status === 302) {
console.log(`Cannot determine membership for ${commenter} (insufficient token permissions)`);
return;
} else {
console.error(`Error checking membership: ${error.message}`);
throw error;
}
}
// Logic:
// - Author responds → remove label (feedback provided)
// - NVIDIA non-author comments → add label (team is waiting for response)
// - External non-author comments → remove label (someone provided feedback)
if (isAuthor) {
// Author responded - remove 'waiting for feedback' label
console.log(`${commenter} is the author. Removing '${label}' label if present.`);
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
name: label
});
console.log(`Successfully removed '${label}' label from #${issueNumber}`);
} catch (error) {
if (error.status === 404) {
console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`);
} else {
throw error;
}
}
} else if (isMember) {
// NVIDIA non-author commented - add 'waiting for feedback' label
console.log(`${commenter} is an NVIDIA member (not author). Adding '${label}' label.`);
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
labels: [label]
});
console.log(`Successfully added '${label}' label to #${issueNumber}`);
} else {
// External non-author commented - remove 'waiting for feedback' label
console.log(`${commenter} is external (not author). Removing '${label}' label if present.`);
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
name: label
});
console.log(`Successfully removed '${label}' label from #${issueNumber}`);
} catch (error) {
if (error.status === 404) {
console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`);
} else {
throw error;
}
}
}

.gitignore

@ -55,6 +55,8 @@ tensorrt_llm/scripts
*docs/source/_cpp_gen*
docs/source/**/*.rst
!docs/source/examples/index.rst
!docs/source/deployment-guide/config_table.rst
!docs/source/deployment-guide/note_sections.rst
*.swp
# Testing
@ -72,6 +74,7 @@ llm-test-workspace/
cpp/include/tensorrt_llm/executor/version.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
.devcontainer/.env
/examples/layer_wise_benchmarks/profiles/
@ -86,3 +89,6 @@ compile_commands.json
# Enroot sqsh files
enroot/sw-tensorrt-docker+*.sqsh
enroot/tensorrt_llm.devel.sqsh
# MacOSX Files
.DS_Store

View File

@ -1395,6 +1395,8 @@ repos:
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
exclude: |
(?x)^(.*cubin.cpp | .*cubin.h)$
- id: check-yaml
args: [--allow-multiple-documents]
exclude: ".*/gitlab/.*.yml"
@ -1439,7 +1441,7 @@ repos:
additional_dependencies:
- tomli
# add ignore words list
args: ["-L", "Mor,ans,thirdparty", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"]
args: ["-L", "Mor,ans,thirdparty,subtiles", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:

File diff suppressed because it is too large.

View File

@ -8,33 +8,9 @@
## Coding Guidelines
* Coding style for TensorRT-LLM can be found [in this document](CODING_GUIDELINES.md).
TensorRT-LLM Coding Style can be found [in this document](CODING_GUIDELINES.md).
* All contributed C++ code should be formatted following the rules in TensorRT-LLM's [clang-format](.clang-format) file. The recommended version is clang-format>=14.0.
* Changes can be formatted with the following command:
```bash
# Commit ID is optional - if unspecified, run format on staged changes.
git-clang-format --style file [commit ID/reference]
```
* All contributed Python code should be formatted using the `black` Python package. The recommended version is `black>=23.0`
* Changes can be formatted with the following command:
```bash
git diff --name-only | grep "*.py" | xargs black -l 120
```
* Try to keep pull requests (PRs) as concise as possible:
* Avoid committing commented-out code.
* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
## Coding Style
We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local
Python environment.
We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local Python environment.
```bash
pip install pre-commit
@ -73,6 +49,9 @@ mdformat.................................................................Passed
If any files were modified by this hook, you will need to stage and commit them again.
In addition, please try to keep pull requests (PRs) as concise as possible:
* Avoid committing commented-out code.
* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
## Pull Requests

LICENSE

@ -1,7 +1,84 @@
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Portions of this project are under the following copyright:
- Copyright contributors to the vLLM project
This project is licensed under the Apache 2.0 license, whose full license text is available below.
This project contains portions of code that are based on or derived from
other open source projects, which may have different licenses whose text
is available below.
All modifications and additions to other projects are licensed under the
Apache License 2.0 unless otherwise specified. Please refer to the individual
file headers for specific copyright and license information.
Below is a list of other projects that have portions contained by this project:
--------------------------------------------------------------------------------
causal-conv1d
--------------------------------------------------------------------------------
Original Source: https://github.com/Dao-AILab/causal-conv1d
Copyright (c) 2024, Tri Dao.
Licensed under the BSD 3-Clause License
--------------------------------------------------------------------------------
flash-linear-attention
--------------------------------------------------------------------------------
Original Source: https://github.com/fla-org/flash-linear-attention
Copyright (c) 2023-2025 Songlin Yang
Licensed under the MIT License
--------------------------------------------------------------------------------
InstructEval
--------------------------------------------------------------------------------
Original Source: https://github.com/declare-lab/instruct-eval
Copyright (c) 2020 Dan Hendrycks
Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab
Licensed under the MIT License
--------------------------------------------------------------------------------
Mamba
--------------------------------------------------------------------------------
Original Source: https://github.com/state-spaces/mamba
Copyright 2023 Tri Dao, Albert Gu
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
SGLang
--------------------------------------------------------------------------------
Original Source: https://github.com/sgl-project/sglang
Copyright contributors to the SGLang project
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
Text Generation Inference
--------------------------------------------------------------------------------
Original Source: https://github.com/huggingface/text-generation-inference
Copyright 2022 Hugging Face
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
Transformers
--------------------------------------------------------------------------------
Original Source: https://github.com/huggingface/transformers
Copyright 2018 The HuggingFace Team
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
XGrammar
--------------------------------------------------------------------------------
Original Source: https://github.com/mlc-ai/xgrammar
Copyright (c) 2024 by XGrammar Contributors
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
vLLM
--------------------------------------------------------------------------------
Original Source: https://github.com/vllm-project/vllm
Copyright contributors to the vLLM project
Licensed under the Apache License 2.0
================================================================================
Apache 2.0 LICENSE
================================================================================
Apache License
Version 2.0, January 2004
@ -204,3 +281,54 @@ Portions of this project are under the following copyright:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================================================
MIT LICENSE
================================================================================
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================================================
BSD 3-Clause License
================================================================================
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.<
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads)
[![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org)
[![version](https://img.shields.io/badge/release-1.2.0rc5-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
[![version](https://img.shields.io/badge/release-1.2.0rc6-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

View File

@ -49,7 +49,7 @@ class RootArgs(BaseModel):
return self
@click.group()
@click.group(deprecated=True)
@click.option(
"--tokenizer",
required=True,

View File

@ -1,6 +1,7 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
*AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -17,13 +18,16 @@
*/
#include "utils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include <random>
#include <filesystem>
#include <fstream>
namespace tensorrt_llm::benchmark
TRTLLM_NAMESPACE_BEGIN
namespace benchmark
{
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
@ -98,7 +102,8 @@ Samples parseWorkloadJson(
if (samples.size() < maxNumSamples)
{
TLLM_LOG_WARNING(
"Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
"Dataset size %zu is smaller than given max_num_samples "
"%d, max_num_samples will be ignored.\n",
samples.size(), maxNumSamples);
}
return samples;
@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
return os;
}
} // namespace tensorrt_llm::benchmark
} // namespace benchmark
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/executor.h"
#include <cstdint>
@ -29,7 +30,9 @@
#pragma once
namespace tensorrt_llm::benchmark
TRTLLM_NAMESPACE_BEGIN
namespace benchmark
{
// using namespace tensorrt_llm::batch_manager;
@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int
std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);
} // namespace tensorrt_llm::benchmark
} // namespace benchmark
TRTLLM_NAMESPACE_END

View File

@ -1,2 +1,5 @@
# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
# is updated.
# WAR against https://github.com/advisories/GHSA-gm62-xv2j-4w53
# WAR against https://github.com/advisories/GHSA-2xpw-w6gg-jr37
urllib3>=2.6.0

View File

@ -78,9 +78,7 @@ using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
using CacheSaltIDType = tensorrt_llm::runtime::CacheSaltIDType;
// Type alias for multimodal hash key (hash array + start offset)
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
using MmKey = tensorrt_llm::executor::MmKey;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
@ -325,6 +323,8 @@ public:
size_t getHash() const;
std::vector<MmKey> getExtraKeys() const;
private:
// Linear ID of block independent of pool
IdType mBlockId;

View File

@ -16,8 +16,9 @@
#pragma once
namespace tensorrt_llm
{
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
// Base class for algorithms
struct Algorithm
@ -29,4 +30,4 @@ struct Algorithm
Algorithm& operator=(Algorithm const&) = delete;
};
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,9 +17,13 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cstdint>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
//!
@ -100,4 +104,6 @@ private:
size_type mSize;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,14 +16,19 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
TRTLLM_NAMESPACE_BEGIN
class DebugConfig
{
public:
static bool isCheckDebugEnabled();
};
TRTLLM_NAMESPACE_END
#if defined(_WIN32)
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
@ -35,8 +40,8 @@ public:
#define TLLM_CHECK(val) \
do \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} while (0)
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
@ -51,17 +56,17 @@ public:
#define TLLM_CHECK_DEBUG(val) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} \
} while (0)
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) \
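For orientation, a minimal usage sketch of the check macros above; the function, the values, and the `assert.h` include path are illustrative assumptions, not part of this diff:

```cpp
#include "tensorrt_llm/common/assert.h" // assumed home of the TLLM_CHECK* macros

void copyRow(int idx, int numRows)
{
    // Always evaluated; failure throws via tensorrt_llm::common::throwRuntimeError.
    TLLM_CHECK(numRows > 0);
    // Evaluated only when tensorrt_llm::DebugConfig::isCheckDebugEnabled() returns true,
    // which is why the macro now spells out the fully qualified DebugConfig name.
    TLLM_CHECK_DEBUG(idx < numRows);
}
```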

View File

@ -17,9 +17,13 @@
#pragma once
#include "c10/util/intrusive_ptr.h"
#include "tensorrt_llm/common/config.h"
#include <Python.h>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// Adapted from pybind11's example implementation:
@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
return c10::intrusive_ptr<T>::reclaim_copy(p);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -0,0 +1,62 @@
/*
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef TRTLLM_CONFIG_H
#define TRTLLM_CONFIG_H
/**
* \def TRTLLM_ABI_NAMESPACE
* This macro is used to open an implicitly inline namespace block for the ABI version.
* This macro can be overridden to change the ABI version.
* The default ABI version is _v1.
*/
#ifndef TRTLLM_ABI_NAMESPACE
#define TRTLLM_ABI_NAMESPACE _v1
#endif
#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
#define TRTLLM_ABI_NAMESPACE_BEGIN \
inline namespace TRTLLM_ABI_NAMESPACE \
{
#endif
#ifndef TRTLLM_ABI_NAMESPACE_END
#define TRTLLM_ABI_NAMESPACE_END }
#endif
/**
* \def TRTLLM_NAMESPACE_BEGIN
* This macro is used to open a `tensorrt_llm::` namespace block, along with any
* enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
* This macro is defined by TensorRT-LLM and may not be overridden.
*/
#define TRTLLM_NAMESPACE_BEGIN \
namespace tensorrt_llm \
{ \
TRTLLM_ABI_NAMESPACE_BEGIN
/**
* \def TRTLLM_NAMESPACE_END
* This macro is used to close a `tensorrt_llm::` namespace block, along with any
* enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
* This macro is defined by TensorRT-LLM and may not be overridden.
*/
#define TRTLLM_NAMESPACE_END \
TRTLLM_ABI_NAMESPACE_END \
} /* end namespace tensorrt_llm */
#endif // TRTLLM_CONFIG_H
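The net effect is that headers can swap their explicit `namespace tensorrt_llm { ... }` blocks for the ABI-versioned form, which is the pattern the other header changes in this commit follow. A minimal sketch (the `answer` declaration is a placeholder):

```cpp
#include "tensorrt_llm/common/config.h"

TRTLLM_NAMESPACE_BEGIN // expands to: namespace tensorrt_llm { inline namespace _v1 {
namespace common
{

// Because _v1 is an inline namespace, this stays reachable as tensorrt_llm::common::answer().
int answer();

} // namespace common
TRTLLM_NAMESPACE_END // closes both _v1 and tensorrt_llm
```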

View File

@ -16,6 +16,8 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#include <cuda_runtime.h>
@ -29,8 +31,8 @@
#define USE_QGMMA
#endif
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
#endif // ENABLE_FP8

View File

@ -14,12 +14,18 @@
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_set>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Populate the start and end profiling iteration indexes from the provided environment variables
@ -28,4 +34,6 @@ namespace tensorrt_llm::common
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaDriverWrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
@ -49,7 +50,9 @@
// this undef.
#endif // WIN32
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// workspace for cublas gemm : 32MB
@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
DEFINE_MEMBER_CHECKER(qua)
DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
/*
* Macros compliant with TensorRT coding conventions

View File

@ -16,11 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
#include <NvInferRuntime.h>
#include <map>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
constexpr static size_t getDTypeSize(nvinfer1::DataType type)
@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
return "";
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -22,9 +22,12 @@
#include <string>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class Logger
@ -125,12 +128,12 @@ private:
static inline std::string getPrefix(Level const level)
{
return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
}
static inline std::string getPrefix(Level const level, int const rank)
{
return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
}
};
@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
out << std::endl;
}
}
} // namespace common
TRTLLM_NAMESPACE_END
#define TLLM_LOG(level, ...) \
do \
@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
} // namespace tensorrt_llm::common
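A short sketch of how the logging macros defined here are called elsewhere in this commit (compare the TLLM_LOG_WARNING call in the benchmark utils diff above); the function and message are illustrative:

```cpp
#include <cstddef>

#include "tensorrt_llm/common/logger.h"

void reportDatasetSize(std::size_t datasetSize, int maxNumSamples)
{
    if (datasetSize < static_cast<std::size_t>(maxNumSamples))
    {
        // printf-style formatting, routed through tensorrt_llm::common::Logger
        TLLM_LOG_WARNING("Dataset size %zu is smaller than max_num_samples %d.", datasetSize, maxNumSamples);
    }
}
```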

View File

@ -16,11 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <functional>
#include <memory>
#include <optional>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/**
@ -100,4 +104,6 @@ public:
}
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <optional>
#include <string>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -480,4 +482,5 @@ public:
};
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif // ENABLE_BF16
@ -28,7 +29,9 @@
#include <unordered_set>
#include <vector>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
#if ENABLE_BF16
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
}
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <array>
@ -41,7 +42,9 @@
tensorrt_llm::common::RequestSpecificException( \
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Enumeration of different error codes for request-specific exceptions
@ -77,7 +80,8 @@ private:
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
{
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
throw TllmException(
file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
}
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
@ -102,4 +106,6 @@ private:
RequestErrorCode mErrorCode;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,8 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <algorithm>
#include <initializer_list>
#include <string>
@ -24,7 +26,9 @@
#include <pthread.h>
#endif
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
inline bool setThreadName(std::string const& name)
@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
return std::find(c.begin(), c.end(), v) != c.end();
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -66,6 +66,7 @@ public:
[[nodiscard]] virtual std::vector<Connection const*> getConnections(CommState const& state) = 0;
[[nodiscard]] virtual CommState const& getCommState() const = 0;
[[nodiscard]] virtual bool isRunning() const = 0;
};
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -47,6 +47,12 @@ class BaseKVCacheManager;
namespace tensorrt_llm::executor
{
using SizeType32 = tensorrt_llm::runtime::SizeType32;
// MmKey is used in KVCacheBlock when multimodal data is present in a block.
// Type alias for hash array + start offset at per-block granularity.
// This differs from the per-request level multimodal hash in MultimodalInput.
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
/// @brief Version of TRT-LLM
char const* version() noexcept;
@ -1691,12 +1697,14 @@ struct KVCacheStoredBlockData
{
KVCacheStoredBlockData(IdType blockHash, tensorrt_llm::runtime::VecUniqueTokens tokens,
std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId, SizeType32 cacheLevel, SizeType32 priority)
std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId, SizeType32 cacheLevel, SizeType32 priority,
std::vector<MmKey> mmKeys = {})
: blockHash{blockHash}
, tokens{std::move(tokens)}
, loraId{loraId}
, cacheLevel{cacheLevel}
, priority{priority}
, mmKeys{std::move(mmKeys)}
{
}
@ -1710,6 +1718,8 @@ struct KVCacheStoredBlockData
SizeType32 cacheLevel;
/// @brief The priority of the block
SizeType32 priority;
/// @brief The multimodal keys of the block
std::vector<MmKey> mmKeys;
};
struct KVCacheStoredData
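The new trailing `mmKeys` member above carries the per-block multimodal keys; it defaults to an empty vector, so existing call sites keep compiling. A small hypothetical sketch of building one such key (hash bytes and offset are placeholders):

```cpp
#include <array>
#include <cstdint>
#include <vector>

#include "tensorrt_llm/executor/executor.h" // declares MmKey as shown above

namespace texec = tensorrt_llm::executor;

// One per-block multimodal key: a 32-byte content hash plus the token offset
// at which the multimodal item starts within the block.
std::vector<texec::MmKey> makeMmKeys()
{
    std::array<std::uint8_t, 32> hash{}; // placeholder hash bytes
    texec::SizeType32 startOffset = 0;   // placeholder offset
    return {texec::MmKey{hash, startOffset}};
}
```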

View File

@ -16,7 +16,11 @@
#pragma once
namespace tensorrt_llm::kernels
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
namespace detail
@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;
} // namespace arch
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,11 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/types.h"
#include <cstdint>
#include <curand_kernel.h>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
class FinishedState
@ -308,4 +311,6 @@ template <typename T>
void invokeScatterDecodingParams(
T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,11 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <cuda_runtime.h>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
class KVCacheIndex
@ -53,4 +56,6 @@ private:
UnderlyingType value;
};
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -14,16 +14,18 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/iBuffer.h"
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -1,18 +1,18 @@
# ##################################################################################################
# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Redistribution and use in source and binary forms, with or without modification, are not permit-
# ted.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ##################################################################################################
# #################################################################################################

View File

@ -200,38 +200,22 @@ ns_close = r"""
copyright = '''\
/***************************************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Redistribution and use in source and binary forms, with or without modification, are not permit-
* ted.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**************************************************************************************************/
''' if not generate_cu_trtllm else r"""/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
'''
makefile_template = '''\
@ -2175,7 +2159,8 @@ def get_kernel_code(kspec, kname, lname):
params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
@ -2201,8 +2186,19 @@ def get_kernel_code(kspec, kname, lname):
const int COMPUTE_REG_COUNT = {compute_reg_count};
asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
compute_reg_count=compute_reg_count)
local_ns_open = ns_open if generate_cu_trtllm else ''
local_ns_close = ns_close if generate_cu_trtllm else ''
abi_ns_open = r"""
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
// clang-format off
"""
abi_ns_close = r"""
// clang-format on
} // namespace kernels
TRTLLM_NAMESPACE_END
"""
local_ns_open = abi_ns_open if generate_cu_trtllm else ''
local_ns_close = abi_ns_close if generate_cu_trtllm else ''
tmp = dict(locals(), **kspec._asdict())
@ -3077,8 +3073,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):
def get_cubin_header(kernel_traits, specs_names):
cubins = []
cubin_lens = []
launchers = []
cubins_dict = {}
cubin_lens_dict = {}
launchers_dict = {}
for kspec, fname, lname, kname in specs_names:
if generate_cu_trtllm and not use_cubin_header(
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
@ -3282,11 +3280,11 @@ def get_cubin_header(kernel_traits, specs_names):
if generate_cu_trtllm and lname != 'nullptr':
launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
lname=lname)
if int(sm) in cubins_dict:
if launcher not in cubins_dict[int(sm)]:
cubins_dict[int(sm)].append(launcher)
if int(sm) in launchers_dict:
if launcher not in launchers_dict[int(sm)]:
launchers_dict[int(sm)].append(launcher)
else:
cubins_dict[int(sm)] = [launcher]
launchers_dict[int(sm)] = [launcher]
elif 'mhca' in kname:
code = '''\
{{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
@ -3309,17 +3307,33 @@ def get_cubin_header(kernel_traits, specs_names):
else:
metadata_v2 = ',\n'.join(metadata_v2)
# Add macros to only include needed cubins during compilation.
for sm in cubins_dict.keys():
# Collect all SM versions from all dictionaries
all_sms = sorted(
set(
list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
list(launchers_dict.keys())))
for sm in all_sms:
macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
macro_end = f"#endif\n"
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
# Add cubin array declarations
if sm in cubins_dict:
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
# Add cubin length declarations
if sm in cubin_lens_dict:
cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
# Add launcher declarations
if sm in launchers_dict:
launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])
unroll_config_v1 = ',\n'.join(unroll_config_v1)
unroll_config_v2 = ',\n'.join(unroll_config_v2)
cubins = '\n'.join(cubins)
cubin_lens = '\n'.join(cubin_lens)
launchers = '\n'.join(launchers)
local_ns_open = ns_open
local_ns_close = ns_close if generate_cu_trtllm else '}'
launcher_line = '''
@ -3431,7 +3445,157 @@ static const struct TestMetaV2
'''.format(**locals(), copyright=copyright)
return code
# Generate header content (.h file)
if "GENERATE_CUBIN" in os.environ:
header_content = '''\
{copyright}
#pragma once
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels{{
struct FusedMultiHeadAttentionKernelMetaInfoV2
{{
Data_type mDataTypeIn;
Data_type mDataTypeOut;
unsigned int mS;
unsigned int mStepQ;
unsigned int mStepKV;
unsigned int mD;
unsigned int mDV;
unsigned int mSageBlockSizeQ;
unsigned int mSageBlockSizeK;
unsigned int mSageBlockSizeV;
unsigned int mSM;
const unsigned char* mCubin;
unsigned int mCubinSize;
const char* mFuncName;
unsigned int mSharedMemBytes;
unsigned int mThreadsPerCTA;
unsigned int mUnrollStep;
int mAttentionMaskType;
int mAttentionInputLayout;
bool mInterleaved;
bool mFlashAttention;
bool mWarpSpecialization;
bool mFP32Accumulation;
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
extern const int sMhaKernelMetaInfosV2Size;
}} // namespace kernels
TRTLLM_NAMESPACE_END
'''.format(**locals(), copyright=copyright)
# Generate source content (.cpp file)
source_content = '''\
{copyright}
#include "tensorrt_llm/common/config.h"
#include <cstddef>
#include <cstdint>
#include <cuda_runtime_api.h>
{local_ns_open}
//--- Cubin Arrays
{cubins}
//--- Cubin Lengths
{cubin_lens}
{local_ns_close}
using namespace tensorrt_llm::kernels;
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
class Fused_multihead_attention_params_v2;
class Launch_params;
//--- Kernel Launchers
{launchers}
// FIXME: These are duplicated declarations, we should remove them in the future.
constexpr int32_t kSM_70 = 70;
constexpr int32_t kSM_72 = 72;
constexpr int32_t kSM_75 = 75;
constexpr int32_t kSM_80 = 80;
constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_89 = 89;
constexpr int32_t kSM_90 = 90;
constexpr int32_t kSM_100 = 100;
constexpr int32_t kSM_100f = 10100;
constexpr int32_t kSM_103 = 103;
constexpr int32_t kSM_120 = 120;
constexpr int32_t kSM_121 = 121;
// FIXME: These are duplicated declarations, we should remove them in the future.
enum Data_type
{{
DATA_TYPE_BOOL,
DATA_TYPE_FP16,
DATA_TYPE_FP32,
DATA_TYPE_INT4,
DATA_TYPE_INT8,
DATA_TYPE_INT32,
DATA_TYPE_BF16,
DATA_TYPE_E2M1,
DATA_TYPE_E4M3,
DATA_TYPE_E5M2
}};
struct FusedMultiHeadAttentionKernelMetaInfoV2
{{
Data_type mDataTypeIn;
Data_type mDataTypeOut;
unsigned int mS;
unsigned int mStepQ;
unsigned int mStepKV;
unsigned int mD;
unsigned int mDV;
unsigned int mSageBlockSizeQ;
unsigned int mSageBlockSizeK;
unsigned int mSageBlockSizeV;
unsigned int mSM;
const unsigned char* mCubin;
unsigned int mCubinSize;
const char* mFuncName;
unsigned int mSharedMemBytes;
unsigned int mThreadsPerCTA;
unsigned int mUnrollStep;
int mAttentionMaskType;
int mAttentionInputLayout;
bool mInterleaved;
bool mFlashAttention;
bool mWarpSpecialization;
bool mFP32Accumulation;
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
{metadata_v2}
}};
extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
'''.format(**locals(), copyright=copyright)
else:
# Non-GENERATE_CUBIN mode: use old behavior
header_content = code
source_content = None
return header_content, source_content
# This is used to add some kernels running in cubins for passing CI cases.
@ -3449,9 +3613,20 @@ def modify_cubin_header(cubin_header):
return result
target = "#ifndef EXCLUDE_SM_80"
addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
result = add_kernel_line(result, target, addition)
addition_cubin_array = """
#ifndef EXCLUDE_SM_80
extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
#endif
"""
addition_cubin_length = """
#ifndef EXCLUDE_SM_80
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
#endif
"""
# Add cubin array and length into their corresponding sections.
result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
result = add_kernel_line(result, "//--- Cubin Lengths",
addition_cubin_length)
def modify_kernel_line(result, target, new_line):
lines = result.split('\n')
@ -3534,13 +3709,22 @@ def generate_files(specs_names):
output = output.decode('utf-8').strip()
# this gives: kname, smem bytes, threads_per_cta, loop_step
kernel_traits = [traits.split() for traits in output.splitlines()]
cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
# Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files
# To switch back to old behavior, replace get_cubin_header_and_source with get_cubin_header
cubin_header, cubin_source = get_cubin_header(kernel_traits,
valid_specs_names)
if generate_cu_trtllm:
cubin_header = modify_cubin_header(cubin_header)
cubin_source = modify_cubin_header(cubin_source)
# Write fmha_cubin.h file
with open('./generated/fmha_cubin.h', 'w') as f:
f.write(cubin_header)
# Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
if cubin_source is not None:
with open('./generated/fmha_cubin.cpp', 'w') as f:
f.write(cubin_source)
def enumerate_hgmma_tma_kernels(specs, sm=90):
specs.append(

View File

@ -1,18 +1,18 @@
# ##################################################################################################
# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Redistribution and use in source and binary forms, with or without modification, are not permit-
# ted.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ##################################################################################################
# #################################################################################################

View File

@ -32,20 +32,20 @@ dtype2traits = {
fmha_dgrad_v2_flash_attention_template = '''\
/***************************************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Redistribution and use in source and binary forms, with or without modification, are not permit-
* ted.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**************************************************************************************************/
#include "fused_multihead_attention_fprop.h"
@ -157,20 +157,20 @@ void run_fmha_dgrad_v2_flash_attention_{dtype}_S_{head_size}_sm{sm}(
fmha_fprop_v2_flash_attention_template = '''\
/***************************************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Redistribution and use in source and binary forms, with or without modification, are not permit-
* ted.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**************************************************************************************************/
#include "fused_multihead_attention_fprop.h"

View File

@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
*/
#pragma once
namespace tensorrt_llm {
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels {
'''
@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
'''
TEMPLATE_EPILOGUE = '''}
}
TRTLLM_NAMESPACE_END
'''
D = defaultdict(list)

View File

@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
* See the License for the specific language governing permissions and
* limitations under the License.
*/
namespace tensorrt_llm
{
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
// clang-format off
@ -96,7 +98,7 @@ namespace kernels
cpp_file_suffex_text = R"""
// clang-format on
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
"""
cubin_meta_info_struct_prefix_text = R"""

View File

@ -466,20 +466,53 @@ using WarpAcc = WarpAccT<warpTile.y, warpTile.x>;
#define MMAS_N_PER_MASK 2
__device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskType const* mask, uint32_t rowOffset,
uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize)
uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
,
int32_t tok0WinBeg, uint32_t seqIter, uint32_t const cacheSeqLen, uint32_t const warpTileTokenBeg
#endif
)
{
uint32_t const idxInQuad = laneId() % 4;
uint32_t const idxQuad = laneId() / 4;
// Packed mask is aligned with 32 bits (2 uint16_t).
uint32_t const nbPackedMasksPerRow = divUp(qSeqLen, 32u) * 2u;
uint16_t const* uint16Mask = reinterpret_cast<uint16_t const*>(mask);
constexpr uint64_t fullMask = ~uint64_t{0};
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
Range const tileRange = {warpTileTokenBeg, warpTileTokenBeg + warpTile.x};
Range const maxMaskOutRange = {0, mha::max(0, tok0WinBeg) + (nbValidRows / MMAS_N_PER_MASK - 1)};
bool const ctaNeedBegMask = tileRange.beg < maxMaskOutRange.end;
assert(ctaNeedBegMask == overlap(tileRange, maxMaskOutRange));
int32_t const tok0NbMaskOut = int32_t(tok0WinBeg) - int32_t(warpTileTokenBeg);
uint32_t const nbSeqItersWithoutSpecDecMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
bool const ctaNeedSpecDecMask = (seqIter >= nbSeqItersWithoutSpecDecMask);
#else
constexpr bool ctaNeedBegMask = false;
bool const ctaNeedSpecDecMask = true;
int32_t const tok0NbMaskOut = -2147483648;
#endif
bool const needMask = ctaNeedBegMask || ctaNeedSpecDecMask;
if (!needMask)
{
return;
}
#pragma unroll
for (uint32_t m = 0; m < acc.rows; m++)
{
#pragma unroll
for (uint32_t i = 0; i < InstAcc::rows; i++)
{
uint32_t const tokenRow = min((rowOffset + instM * m + idxQuad + i * 8) / headGrpSize, actualQSeqLen - 1);
uint32_t const idxQTokInCta = (rowOffset + instM * m + idxQuad + i * 8) / headGrpSize;
uint32_t const tokenRow = min(idxQTokInCta, actualQSeqLen - 1);
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta);
uint64_t const begMask = (begNbMaskOut > 0 ? fullMask << begNbMaskOut : fullMask);
#else
uint64_t const begMask = fullMask;
#endif
#pragma unroll
for (uint32_t mask_n = 0; mask_n < acc.cols / MMAS_N_PER_MASK; mask_n++)
{
@ -491,12 +524,15 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
uint32_t const maskPos1 = lastCol + actualQSeqLen < nbValidCols
? 0u
: min(lastCol + actualQSeqLen - nbValidCols, actualQSeqLen - 1);
uint32_t packedMask = 0u;
uint32_t const maskPosStart = (maskPos0 / 16) * 16;
reinterpret_cast<uint16_t*>(&packedMask)[0]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
reinterpret_cast<uint16_t*>(&packedMask)[1]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
uint32_t packedMask = ~uint32_t{0};
if (ctaNeedSpecDecMask)
{
reinterpret_cast<uint16_t*>(&packedMask)[0]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
reinterpret_cast<uint16_t*>(&packedMask)[1]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
}
#pragma unroll
for (uint32_t nj = 0; nj < MMAS_N_PER_MASK; nj++)
{
@ -510,7 +546,11 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
bool const maskFlag = col + actualQSeqLen < nbValidCols
? true
: packedMask & (1u << ((col + actualQSeqLen - nbValidCols) - maskPosStart));
acc(m, n)(i, j) = maskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
bool const begMaskFlag = ctaNeedBegMask ? (begMask & (1ULL << col)) : true;
acc(m, n)(i, j)
= maskFlag && begMaskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
}
}
}
@ -1611,8 +1651,14 @@ CUBIN_EXPORT __global__
#endif
uint32_t const cacheSeqLen = getCacheSeqLen<usePagedKVCache>(cacheList, idxReq);
#if SLIDING_WINDOW
#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
uint32_t const tok0SeqLen = cacheSeqLen - actualQSeqLen + 1 + idxHeadTokenInGrp; // ctaTokOffset;
int32_t const tok0WinBeg = int32_t(tok0SeqLen) - int32_t(slidingWinSize);
uint32_t const nbTotalSkipTokens = mha::max(0, tok0WinBeg);
#elif SLIDING_WINDOW
bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize);
assert(!SPEC_DEC || !rtIsReallySliding);
uint32_t const nbTotalSkipTokens = rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0;
#else
constexpr bool rtIsReallySliding = false;
@ -1626,7 +1672,9 @@ CUBIN_EXPORT __global__
#endif
uint32_t const nbSeqIters = useKVCache ? divUp(cacheSeqLen, ctaTile.x) : 0;
#if SPEC_DEC
#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
uint32_t const nbSeqItersWithoutMask = nbSkipLeadingTiles;
#elif SPEC_DEC
uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
#endif
@ -1912,8 +1960,12 @@ CUBIN_EXPORT __global__
if (seqIter >= nbSeqItersWithoutMask)
{
uint32_t const nbValidCols = (warpTileTokenBeg < cacheSeqLen ? cacheSeqLen - warpTileTokenBeg : 0U);
applyMaskFromInput(
warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize);
applyMaskFromInput(warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
,
tok0WinBeg, seqIter, cacheSeqLen, warpTileTokenBeg
#endif
);
}
#else
bool const isFirstIter = (seqIter == nbSkipLeadingTiles);
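
The new SLIDING_WINDOW && !IS_SPEC_DEC_TREE path in this kernel adds a per-row "begin" mask that drops columns falling before the sliding window, on top of the existing packed spec-dec mask; a column then survives only if begMask & (1ULL << col) is set. A simplified host-side illustration of the begin-mask computation (names mirror the kernel; this is a sketch that assumes a warp-tile row spans at most 64 columns, and it clamps the shift where the kernel relies on that bound):

#include <cstdint>

// Columns [0, begNbMaskOut) of a warp-tile row lie before the sliding window for
// this query token and must be masked off; the remaining columns stay valid.
inline uint64_t computeBegMask(int32_t tok0WinBeg, uint32_t warpTileTokenBeg, uint32_t idxQTokInCta)
{
    constexpr uint64_t fullMask = ~uint64_t{0};
    int32_t const tok0NbMaskOut = tok0WinBeg - int32_t(warpTileTokenBeg);
    int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta);
    if (begNbMaskOut <= 0)
    {
        return fullMask; // the whole row is inside the window
    }
    return begNbMaskOut >= 64 ? uint64_t{0} : (fullMask << begNbMaskOut);
}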

View File

@ -247,7 +247,8 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
{
break;
}
else if (req->isGenerationInProgressState())
if (req->isGenerationInProgressState())
{
scheduledRequests.emplace_back(req);
reservedBlocks.decrementReservedBlocks(*req);
@ -296,7 +297,8 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
{
break;
}
else if (req->isContextInitState() || req->isDisaggGenerationInitState())
if (req->isContextInitState() || req->isDisaggGenerationInitState())
{
bool enoughBlocks = reservedBlocks.enoughAvailableBlocks(*req);
bool enoughCrossBlocks

View File

@ -360,6 +360,12 @@ public:
RequestInfo info;
auto const* connection = isAgent ? agentConnectionManager->recvConnectionAndRequestInfo(info)
: mManager->recvConnect(DataContext{TransceiverTag::kID_TAG}, &id, sizeof(id));
if (connection == nullptr && !mManager->isRunning())
{
TLLM_LOG_WARNING("recvRequestInfo: connection is nullptr; the server may be terminating");
return info;
}
if (!isAgent)
{
TLLM_CHECK(id == TransceiverTag::Id::REQUEST_SEND);
@ -616,6 +622,10 @@ private:
if (!mReadyResponses.empty())
{
auto const& requestInfo = recvRequestInfo();
if (mTerminate || !mManager->isRunning())
{
return;
}
auto reqId = requestInfo.getRequestId();
{

View File

@ -102,7 +102,7 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks
for (auto const& block : blocks)
{
data.blocks.emplace_back(block->getHash(), block->getUniqueTokens(), block->getBlockKey().loraTaskId,
block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority());
block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority(), block->getExtraKeys());
}
enqueueEvent({mEventId++, data, windowSize, mAttentionDpRank});

View File

@ -284,6 +284,11 @@ tk::KVCacheIndex::UnderlyingType KVCacheBlock::getMemoryPoolBlockIndex() const
return mMemoryPoolBlockIndex.get();
}
std::vector<MmKey> KVCacheBlock::getExtraKeys() const
{
return mBlockKey.extraKeys;
}
bool KVCacheBlock::isPrimary() const
{
return mMemoryPoolBlockIndex.isPrimary();

View File

@ -27,7 +27,7 @@ bool initCheckDebug()
}
} // namespace
bool DebugConfig::isCheckDebugEnabled()
bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
{
static bool const debugEnabled = initCheckDebug();
return debugEnabled;

View File

@ -16,6 +16,7 @@
*/
#include "attentionOp.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/common/quantization.h"
@ -36,7 +37,9 @@
#include <nccl.h>
#endif // ENABLE_MULTI_DEVICE
namespace tensorrt_llm::common::op
TRTLLM_NAMESPACE_BEGIN
namespace common::op
{
class AttentionOp
@ -543,4 +546,6 @@ private:
UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
};
} // namespace tensorrt_llm::common::op
} // namespace common::op
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasVersionCheck.h"
#include <algorithm>
#include <unordered_map>
@ -24,8 +25,8 @@
#error CUDART_VERSION Undefined!
#endif
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
@ -24,8 +25,8 @@
#include <optional>
#include <string>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -185,4 +186,4 @@ public:
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif // ENABLE_BF16
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
// Operator definitions intentionally in global namespace
namespace

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <assert.h>
#include <cstdlib>
@ -28,8 +29,8 @@
#include <string>
#include <type_traits>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
static __host__ __device__ int hash(int val)
@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
};
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -18,6 +18,7 @@
#if defined(_WIN32)
#include <windows.h>
#define dllOpen(name) LoadLibrary("nv" name ".dll")
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
@ -29,6 +30,7 @@
#endif // defined(_WIN32)
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaDriverWrapper.h"
#include "tensorrt_llm/common/logger.h"
#include <cuda.h>
@ -36,7 +38,9 @@
#include <cstdio>
#include <mutex>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#ifndef CUDA_DRIVER_WRAPPER_H
#define CUDA_DRIVER_WRAPPER_H
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
@ -25,7 +26,9 @@
#include <cstdio>
#include <memory>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class CUDADriverWrapper
@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
}
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
/*
* Macros compliant with TensorRT coding conventions
*/

View File

@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
@ -24,8 +25,8 @@
#include <limits>
#include <type_traits>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
#ifdef ENABLE_FP8
@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/cudaProfilerUtils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cstdint>
@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte
} // namespace
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
return std::make_pair(profileIterIdxs, stopIterIdxs);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -25,9 +25,10 @@
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace common
{
@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
@ -25,7 +26,9 @@
using tensorrt_llm::kernels::AllReduceFusionOp;
using tensorrt_llm::kernels::AllReduceStrategyType;
namespace tensorrt_llm::utils::customAllReduceUtils
TRTLLM_NAMESPACE_BEGIN
namespace utils::customAllReduceUtils
{
constexpr size_t NUM_POINTERS_PER_RANK = 7;
@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
{90, AllReduceBestStrategyTableSM90},
{100, AllReduceBestStrategyTableSM100},
};
} // namespace tensorrt_llm::utils::customAllReduceUtils
} // namespace utils::customAllReduceUtils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
*/
#include "envUtils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
@ -25,7 +26,9 @@
#include <optional>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::optional<int32_t> getIntEnv(char const* name)
@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,16 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <optional>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// Useful when you want to inject some debug code controllable with env var.
std::optional<int32_t> getIntEnv(char const* name);
@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();
bool getEnvEplbForceGdrcopy();
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -19,6 +19,7 @@
#ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
#define TRTLLM_CUDA_LAMPORT_UTILS_CUH
#include "tensorrt_llm/common/config.h"
#include <array>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
@ -29,7 +30,9 @@
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
@ -279,6 +282,7 @@ private:
}
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
#endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH

View File

@ -15,12 +15,15 @@
*/
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
#include <cuda_runtime.h>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
Logger::Logger()
@ -70,4 +73,6 @@ Logger* Logger::getLogger()
thread_local Logger instance;
return &instance;
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,10 +16,11 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,11 +14,15 @@
* limitations under the License.
*/
#include "mcastDevMemUtils.h"
#include "tensorrt_llm/common/config.h"
#include <unordered_map>
namespace tensorrt_llm::common
using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
TRTLLM_NAMESPACE_BEGIN
namespace common
{
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
namespace
{
@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
{
return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,13 +15,17 @@
*/
#pragma once
// Avoid circular dependency
#include "tensorrt_llm/common/config.h"
namespace tensorrt_llm::runtime
{
class McastDeviceMemory;
}
} // namespace tensorrt_llm::runtime
namespace tensorrt_llm::common
// Avoid circular dependency
TRTLLM_NAMESPACE_BEGIN
namespace common
{
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
// Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
// information. Thus a derived pointer cannot be used as the key.
McastDeviceMemory* findMcastDevMemBuffer(void* ptr);
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
@ -25,8 +26,8 @@
#include <sanitizer/asan_interface.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -961,4 +962,5 @@ void calcAlignedPointers(
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cassert>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
@ -46,7 +47,9 @@
#include <dlfcn.h>
#endif
namespace tensorrt_llm::common::nccl_util
TRTLLM_NAMESPACE_BEGIN
namespace common::nccl_util
{
//==============================================================================
@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
return std::make_pair(tensor, buffer);
}
} // namespace tensorrt_llm::common::nccl_util
} // namespace common::nccl_util
TRTLLM_NAMESPACE_END
#endif // ENABLE_MULTI_DEVICE

View File

@ -25,10 +25,13 @@
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#include "tensorrt_llm/common/config.h"
#include <array>
namespace tensorrt_llm::common::nvtx
TRTLLM_NAMESPACE_BEGIN
namespace common::nvtx
{
inline nvtx3::color nextColor()
{
@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
#endif
}
} // namespace tensorrt_llm::common::nvtx
} // namespace common::nvtx
TRTLLM_NAMESPACE_END
#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)

View File

@ -29,6 +29,7 @@
#include <mutex>
#include <thread>
TRTLLM_NAMESPACE_BEGIN
#if ENABLE_MULTI_DEVICE
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
});
return creator();
}
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/workspace.h"
@ -37,7 +38,9 @@
#include <string>
#include <unordered_map>
namespace tensorrt_llm::common::op
TRTLLM_NAMESPACE_BEGIN
namespace common::op
{
// Write values into buffer
@ -178,7 +181,7 @@ struct hash
// for testing only
void const* getCommSessionHandle();
} // namespace tensorrt_llm::common::op
} // namespace common::op
inline bool isBuilding()
{
@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
std::shared_ptr<cublasHandle_t> getCublasHandle();
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
TRTLLM_NAMESPACE_END
#ifndef DEBUG
#define PLUGIN_CHECK(status) \

View File

@ -16,14 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
#include <float.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -21,6 +21,7 @@
#else
#include <cooperative_groups.h>
#endif
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
@ -30,8 +31,8 @@
namespace cg = cooperative_groups;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#include "safetensors.h"
#include "nlohmann/json.hpp"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h>
#include <cstdint>
#include <fstream>
@ -25,7 +26,9 @@
#include <utility>
#include <vector>
namespace tensorrt_llm::common::safetensors
TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{
using nvinfer1::DataType;
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
{
return std::make_shared<SafeTensor>(filename);
}
} // namespace tensorrt_llm::common::safetensors
} // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include <NvInferRuntime.h>
#include <cstdint>
@ -23,7 +24,9 @@
#include <memory>
#include <utility>
namespace tensorrt_llm::common::safetensors
TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{
class INdArray
{
@ -58,4 +61,6 @@ public:
virtual ~ISafeTensor() = default;
};
} // namespace tensorrt_llm::common::safetensors
} // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <functional>
#include <numeric>
#include <optional>
#include <sstream>
namespace tensorrt_llm::common::stl_utils
TRTLLM_NAMESPACE_BEGIN
namespace common::stl_utils
{
template <typename TInputIt, typename TOutputIt, typename TBinOp>
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
return oss.str();
}
} // namespace tensorrt_llm::common::stl_utils
} // namespace common::stl_utils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cerrno>
#include <cstdarg>
@ -23,7 +24,9 @@
#include <iostream>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
return values;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,16 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <chrono>
#include <iomanip>
#include <sstream>
#include "tensorrt_llm/common/timestampUtils.h"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::string getCurrentTimestamp()
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
return stream.str();
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,17 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
std::string getCurrentTimestamp();
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cinttypes>
@ -26,7 +27,9 @@
#endif
#include <sstream>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
namespace
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
return mErrorCode;
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,10 +14,13 @@
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstddef>
#include <cstdint>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// CuBLAS >= 12.9.1 requires 256-byte alignment.
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
return total;
}
}; // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -18,10 +18,11 @@
#include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -30,10 +30,11 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "cutlass_extensions/epilogue/thread/fused_activations.h"
#include "tensorrt_llm/common/config.h"
#include <cutlass/epilogue/fusion/operations.hpp>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
};
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -24,10 +24,11 @@
#include "cute/tensor.hpp"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -319,6 +319,10 @@ AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo(batc
{
while (true)
{
if (!mIsRunning)
{
return nullptr;
}
updateUnhandledNotifications();
std::scoped_lock lock(mNotificationMutex);
auto it = mUnhandledNotifications.begin();
@ -491,6 +495,11 @@ void AgentConnectionManager::waitForNotification(std::string const& remoteAgentN
while (true)
{
if (!mIsRunning)
{
return;
}
updateUnhandledNotifications();
std::scoped_lock lock(mNotificationMutex);
auto it = mUnhandledNotifications.begin();
@ -587,6 +596,13 @@ std::string const& AgentConnectionManager::getAgentName() const
AgentConnectionManager::~AgentConnectionManager()
{
mIsRunning = false;
m_Agent->deregisterMemory(mRegMemDescs);
}
bool AgentConnectionManager::isRunning() const
{
return mIsRunning;
}
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -296,6 +296,7 @@ public:
void waitForNotification(std::string const& remoteAgentName, NotificationType& expectedInfo);
void waitForSyncInfo(std::string const& remoteAgentName, NotificationSyncInfo& syncInfo);
void waitForReadySignal(std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo);
[[nodiscard]] bool isRunning() const override;
private:
std::map<std::string, std::shared_ptr<AgentConnection>> mConnections;
@ -309,6 +310,7 @@ private:
int mDeviceId;
std::string mAgentName;
MemoryDescs mRegMemDescs;
std::atomic<bool> mIsRunning{true};
};
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -77,4 +77,13 @@ CommState const& MpiConnectionManager::getCommState() const
return mCommState;
}
bool MpiConnectionManager::isRunning() const
{
return mIsRunning;
}
MpiConnectionManager::~MpiConnectionManager()
{
mIsRunning = false;
}
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -42,14 +42,17 @@ class MpiConnectionManager : public ConnectionManager
{
public:
MpiConnectionManager(mpi::MpiComm const* comm);
~MpiConnectionManager();
MpiConnection const* recvConnect(DataContext const& ctx, void* data, size_t size) override;
[[nodiscard]] std::vector<Connection const*> getConnections(CommState const& state) override;
[[nodiscard]] CommState const& getCommState() const override;
[[nodiscard]] bool isRunning() const override;
private:
mpi::MpiComm const* mComm;
std::map<int, MpiConnection> mConnections;
CommState mCommState;
std::atomic<bool> mIsRunning{true};
};
} // namespace tensorrt_llm::executor::kv_cache
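
The agent, MPI, and UCX connection managers all gain the same shutdown plumbing: an atomic running flag that the destructor clears and that blocking receive/notification loops poll so they can return instead of hanging while the server terminates. A condensed sketch of the pattern (illustrative only; the real classes carry their full connection state):

#include <atomic>

class ConnectionManagerSketch
{
public:
    ~ConnectionManagerSketch()
    {
        mIsRunning = false; // let any loop blocked in its poll cycle observe shutdown
    }

    [[nodiscard]] bool isRunning() const
    {
        return mIsRunning;
    }

    void receiveLoop()
    {
        while (mIsRunning)
        {
            // ... poll for incoming requests or notifications, then handle them ...
        }
        // shutdown requested: return without waiting for messages that may never arrive
    }

private:
    std::atomic<bool> mIsRunning{true};
};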

View File

@ -454,21 +454,10 @@ void NixlTransferAgent::invalidateRemoteAgent(std::string const& name)
void NixlTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage)
{
if (name == mName)
{
// FIXME: nixl does not support gen notif to itself ,but support local transfer. we use local transfer to notify
// itself
MemoryDescs descs{MemoryType::kDRAM, {MemoryDesc{mDRamSrcBuffer}, MemoryDesc{mDRamDstBuffer}}};
TransferRequest request{TransferOp::kWRITE, descs, descs, name, syncMessage};
auto request_status = submitTransferRequests(request);
request_status->wait();
}
else
{
auto status = mRawAgent->genNotif(name, syncMessage);
TLLM_CHECK_WITH_INFO(
status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str());
}
auto status = mRawAgent->genNotif(name, syncMessage);
TLLM_CHECK_WITH_INFO(
status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str());
}
[[nodiscard]] std::unordered_map<std::string, std::vector<SyncMessage>> NixlTransferAgent::getNotifiedSyncMessages()

View File

@ -504,7 +504,7 @@ UcxConnectionManager::~UcxConnectionManager()
socket.close();
mZmqRepThread.join();
}
mIsRunning = false;
mZmqRepSocket.close();
mZmqContext.close();
@ -673,6 +673,11 @@ std::vector<Connection const*> UcxConnectionManager::getConnections(CommState co
return ret;
}
bool UcxConnectionManager::isRunning() const
{
return mIsRunning;
}
CommState const& UcxConnectionManager::getCommState() const
{
return mCommState;

View File

@ -62,6 +62,7 @@ private:
zmq::socket_t mZmqRepSocket;
std::string mZmqRepEndpoint;
std::thread mZmqRepThread;
std::atomic<bool> mIsRunning{true};
UcxConnection::ConnectionIdType getNewConnectionId(std::shared_ptr<ucxx::Endpoint> const& newEp);
UcxConnection::ConnectionIdType addConnection(std::string const& ip, uint16_t port);
@ -85,6 +86,8 @@ public:
{
return mRank;
}
[[nodiscard]] bool isRunning() const override;
};
#if defined(__clang__)

View File

@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
namespace
{
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
[[nodiscard]] bool executorConfigIsValid(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{
// Make sure logic in this function matches fixExecutorConfig
if (executorConfig.getEnableChunkedContext())
@ -65,8 +66,8 @@ namespace
return true;
}
[[nodiscard]] ExecutorConfig fixExecutorConfig(
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
[[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{
// Make sure logic in this function matches executorConfigIsValid
auto fixedExecutorConfig = executorConfig;
@ -241,7 +242,7 @@ private:
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
ExecutorConfig const& executorConfig, bool isEncoder,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{
auto const gpusPerNode = jsonConfig.getGpusPerNode();
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
Executor::Impl::Impl(std::filesystem::path const& modelPath,
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
ModelType const modelType, ExecutorConfig const& executorConfig,
ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
}
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto const& worldConfig = model->getWorldConfig();
auto const tp = worldConfig.getTensorParallelism();
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
shutdown();
}
void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto const gptModelType = [&executorConfig, &modelConfig]()
{
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto fixedExecutorConfig = ExecutorConfig{};
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
}
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
{
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
}
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
std::filesystem::path const& modelPath)
{
#if ENABLE_MULTI_DEVICE

View File

@ -2333,6 +2333,7 @@ size_t Serialization::serializedSize(KVCacheStoredBlockData const& data)
totalSize += su::serializedSize(data.loraId);
totalSize += su::serializedSize(data.cacheLevel);
totalSize += su::serializedSize(data.priority);
totalSize += su::serializedSize(data.mmKeys);
return totalSize;
}
@ -2343,6 +2344,7 @@ void Serialization::serialize(KVCacheStoredBlockData const& data, std::ostream&
su::serialize(data.loraId, os);
su::serialize(data.cacheLevel, os);
su::serialize(data.priority, os);
su::serialize(data.mmKeys, os);
}
KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::istream& is)
@ -2352,8 +2354,9 @@ KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::ist
auto loraId = su::deserialize<std::optional<tensorrt_llm::runtime::LoraTaskIdType>>(is);
auto cacheLevel = su::deserialize<SizeType32>(is);
auto priority = su::deserialize<SizeType32>(is);
auto mmKeys = su::deserialize<std::vector<tensorrt_llm::batch_manager::kv_cache_manager::MmKey>>(is);
return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority};
return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority, mmKeys};
}
// KVcacheRemovedData

View File

@ -40,9 +40,7 @@ list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*")
function(filter_cuda_archs ARCH SOURCES_VAR)
if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
set(FILTER_REGEX
".*_sm(_)?${ARCH}[.]cubin[.]cpp|^.*Sm(_)?${ARCH}.*cubin.cpp$|.*_sm(_)?${ARCH}[.]cu|^.*Sm(_)?${ARCH}.*cu$"
)
set(FILTER_REGEX ".*[Ss][Mm]_?${ARCH}(af)?.*(cubin\.cpp|\.cu)$")
list(APPEND SOURCES ${${SOURCES_VAR}})
list(APPEND SOURCES_FILTERED ${SOURCES})
list(FILTER SOURCES_FILTERED INCLUDE REGEX "${FILTER_REGEX}")
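
The consolidated filter regex collapses the previous pair of alternations into one pattern that also covers the "af" kernel variants and both sm/Sm spellings. For ARCH=90 it is intended to match names such as kernel_sm90.cubin.cpp, fooSm90.cu, bar_sm_90.cu, or kernelSm90af.cubin.cpp (these filenames are illustrative, not taken from the repository).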

View File

@ -16,9 +16,12 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
cudaStream_t stream = 0);
}
TRTLLM_NAMESPACE_END

View File

@ -17,12 +17,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
cudaStream_t const stream = 0);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "attentionMask.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
@ -24,8 +25,8 @@
using namespace tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/gptKernels.h"
#include "tensorrt_llm/runtime/iTensor.h"
@ -25,8 +26,8 @@
namespace tc = tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -64,4 +65,5 @@ template <typename MaskDataType>
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banBadWords.h"
using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

Some files were not shown because too many files have changed in this diff.