mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-13 14:33:52 +08:00
Merge branch 'main' into update_mnnvl_test
This commit is contained in:
commit
6b6daf31a8
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
|
||||
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
|
||||
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
|
||||
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
|
||||
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
|
||||
cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text
|
||||
|
||||
127
.github/workflows/waiting_for_feedback.yml
vendored
Normal file
127
.github/workflows/waiting_for_feedback.yml
vendored
Normal file
@ -0,0 +1,127 @@
|
||||
name: Manage Waiting for Feedback Label
|
||||
|
||||
on:
|
||||
issue_comment:
|
||||
types: [created]
|
||||
pull_request_review_comment:
|
||||
types: [created]
|
||||
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
manage-waiting-for-feedback:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.repository == 'NVIDIA/TensorRT-LLM'
|
||||
steps:
|
||||
- name: Check membership and manage label
|
||||
uses: actions/github-script@v8
|
||||
with:
|
||||
script: |
|
||||
const commenter = context.payload.comment.user.login;
|
||||
const commenterType = context.payload.comment.user.type;
|
||||
const label = 'waiting for feedback';
|
||||
|
||||
// Ignore bots and CI accounts
|
||||
const ignoredAccounts = ['tensorrt-cicd'];
|
||||
if (commenterType === 'Bot' || ignoredAccounts.includes(commenter)) {
|
||||
console.log(`Ignoring comment from ${commenter} (type: ${commenterType}). Skipping.`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle both issue_comment and pull_request_review_comment events
|
||||
// context.issue.number is only available for issue_comment events
|
||||
const issueNumber = context.issue?.number || context.payload.pull_request?.number;
|
||||
const issue = context.payload.issue || context.payload.pull_request;
|
||||
const author = issue?.user?.login;
|
||||
const isAuthor = (commenter === author);
|
||||
|
||||
if (!issueNumber) {
|
||||
console.log('Could not determine issue/PR number. Skipping.');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`Comment by ${commenter} on #${issueNumber} (author: ${author})`);
|
||||
const owner = context.repo.owner;
|
||||
const repo = context.repo.repo;
|
||||
|
||||
// Check if commenter is repository member
|
||||
let isMember = false;
|
||||
try {
|
||||
await github.rest.repos.checkCollaborator({
|
||||
owner,
|
||||
repo,
|
||||
username: commenter
|
||||
});
|
||||
isMember = true;
|
||||
} catch (error) {
|
||||
if (error.status === 404) {
|
||||
isMember = false;
|
||||
} else if (error.status === 302) {
|
||||
console.log(`Cannot determine membership for ${commenter} (insufficient token permissions)`);
|
||||
return;
|
||||
} else {
|
||||
console.error(`Error checking membership: ${error.message}`);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
// Logic:
|
||||
// - Author responds → remove label (feedback provided)
|
||||
// - NVIDIA non-author comments → add label (team is waiting for response)
|
||||
// - External non-author comments → remove label (someone provided feedback)
|
||||
|
||||
if (isAuthor) {
|
||||
// Author responded - remove 'waiting for feedback' label
|
||||
console.log(`${commenter} is the author. Removing '${label}' label if present.`);
|
||||
|
||||
try {
|
||||
await github.rest.issues.removeLabel({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: issueNumber,
|
||||
name: label
|
||||
});
|
||||
console.log(`Successfully removed '${label}' label from #${issueNumber}`);
|
||||
} catch (error) {
|
||||
if (error.status === 404) {
|
||||
console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`);
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (isMember) {
|
||||
// NVIDIA non-author commented - add 'waiting for feedback' label
|
||||
console.log(`${commenter} is an NVIDIA member (not author). Adding '${label}' label.`);
|
||||
|
||||
await github.rest.issues.addLabels({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: issueNumber,
|
||||
labels: [label]
|
||||
});
|
||||
|
||||
console.log(`Successfully added '${label}' label to #${issueNumber}`);
|
||||
|
||||
} else {
|
||||
// External non-author commented - remove 'waiting for feedback' label
|
||||
console.log(`${commenter} is external (not author). Removing '${label}' label if present.`);
|
||||
|
||||
try {
|
||||
await github.rest.issues.removeLabel({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: issueNumber,
|
||||
name: label
|
||||
});
|
||||
console.log(`Successfully removed '${label}' label from #${issueNumber}`);
|
||||
} catch (error) {
|
||||
if (error.status === 404) {
|
||||
console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`);
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@ -55,6 +55,8 @@ tensorrt_llm/scripts
|
||||
*docs/source/_cpp_gen*
|
||||
docs/source/**/*.rst
|
||||
!docs/source/examples/index.rst
|
||||
!docs/source/deployment-guide/config_table.rst
|
||||
!docs/source/deployment-guide/note_sections.rst
|
||||
*.swp
|
||||
|
||||
# Testing
|
||||
@ -72,6 +74,7 @@ llm-test-workspace/
|
||||
cpp/include/tensorrt_llm/executor/version.h
|
||||
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
|
||||
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
|
||||
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
|
||||
.devcontainer/.env
|
||||
/examples/layer_wise_benchmarks/profiles/
|
||||
|
||||
@ -86,3 +89,6 @@ compile_commands.json
|
||||
# Enroot sqsh files
|
||||
enroot/sw-tensorrt-docker+*.sqsh
|
||||
enroot/tensorrt_llm.devel.sqsh
|
||||
|
||||
# MacOSX Files
|
||||
.DS_Store
|
||||
|
||||
@ -1395,6 +1395,8 @@ repos:
|
||||
- id: check-symlinks
|
||||
- id: detect-private-key
|
||||
- id: end-of-file-fixer
|
||||
exclude: |
|
||||
(?x)^(.*cubin.cpp | .*cubin.h)$
|
||||
- id: check-yaml
|
||||
args: [--allow-multiple-documents]
|
||||
exclude: ".*/gitlab/.*.yml"
|
||||
@ -1439,7 +1441,7 @@ repos:
|
||||
additional_dependencies:
|
||||
- tomli
|
||||
# add ignore words list
|
||||
args: ["-L", "Mor,ans,thirdparty", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"]
|
||||
args: ["-L", "Mor,ans,thirdparty,subtiles", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"]
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.4
|
||||
hooks:
|
||||
|
||||
29727
ATTRIBUTIONS-Python.md
29727
ATTRIBUTIONS-Python.md
File diff suppressed because it is too large
Load Diff
@ -8,33 +8,9 @@
|
||||
|
||||
## Coding Guidelines
|
||||
|
||||
* Coding style for TensorRT-LLM can be found [in this document](CODING_GUIDELINES.md).
|
||||
TensorRT-LLM Coding Style can be found [in this document](CODING_GUIDELINES.md).
|
||||
|
||||
* All contributed C++ code should be formatted following the rules in TensorRT-LLM's [clang-format](.clang-format) file. The recommended version is clang-format>=14.0.
|
||||
|
||||
* Changes can be formatted with the following command:
|
||||
|
||||
```bash
|
||||
# Commit ID is optional - if unspecified, run format on staged changes.
|
||||
git-clang-format --style file [commit ID/reference]
|
||||
```
|
||||
|
||||
* All contributed Python code should be formatted using the `black` Python package. The recommended version is `black>=23.0`
|
||||
|
||||
* Changes can be formatted with the following command:
|
||||
|
||||
```bash
|
||||
git diff --name-only | grep "*.py" | xargs black -l 120
|
||||
```
|
||||
|
||||
* Try to keep pull requests (PRs) as concise as possible:
|
||||
* Avoid committing commented-out code.
|
||||
* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
|
||||
|
||||
## Coding Style
|
||||
|
||||
We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local
|
||||
Python environment.
|
||||
We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local Python environment.
|
||||
|
||||
```bash
|
||||
pip install pre-commit
|
||||
@ -73,6 +49,9 @@ mdformat.................................................................Passed
|
||||
|
||||
If any files were modified by this hook, you will need to stage and commit them again.
|
||||
|
||||
In addition, please try to keep pull requests (PRs) as concise as possible:
|
||||
* Avoid committing commented-out code.
|
||||
* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
|
||||
|
||||
## Pull Requests
|
||||
|
||||
|
||||
134
LICENSE
134
LICENSE
@ -1,7 +1,84 @@
|
||||
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
|
||||
Portions of this project are under the following copyright:
|
||||
- Copyright contributors to the vLLM project
|
||||
This project is licensed under the Apache 2.0 license, whose full license text is available below.
|
||||
|
||||
This project contains portions of code that are based on or derived from
|
||||
other open source projects, which may have different licenses whose text
|
||||
is available below.
|
||||
|
||||
All modifications and additions to other projects are licensed under the
|
||||
Apache License 2.0 unless otherwise specified. Please refer to the individual
|
||||
file headers for specific copyright and license information.
|
||||
|
||||
Below is a list of other projects that have portions contained by this project:
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
causal-conv1d
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/Dao-AILab/causal-conv1d
|
||||
Copyright (c) 2024, Tri Dao.
|
||||
Licensed under the BSD 3-Clause License
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
flash-linear-attention
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/fla-org/flash-linear-attention
|
||||
Copyright (c) 2023-2025 Songlin Yang
|
||||
Licensed under the MIT License
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
InstructEval
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/declare-lab/instruct-eval
|
||||
Copyright (c) 2020 Dan Hendrycks
|
||||
Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab
|
||||
Licensed under the MIT License
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
Mamba
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/state-spaces/mamba
|
||||
Copyright 2023 Tri Dao, Albert Gu
|
||||
Licensed under the Apache License 2.0
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
SGLang
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/sgl-project/sglang
|
||||
Copyright contributors to the SGLang project
|
||||
Licensed under the Apache License 2.0
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
Text Generation Inference
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/huggingface/text-generation-inference
|
||||
Copyright 2022 Hugging Face
|
||||
Licensed under the Apache License 2.0
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
Transformers
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/huggingface/transformers
|
||||
Copyright 2018 The HuggingFace Team
|
||||
Licensed under the Apache License 2.0
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
XGrammar
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/mlc-ai/xgrammar
|
||||
Copyright (c) 2024 by XGrammar Contributors
|
||||
Licensed under the Apache License 2.0
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
vLLM
|
||||
--------------------------------------------------------------------------------
|
||||
Original Source: https://github.com/vllm-project/vllm
|
||||
Copyright contributors to the vLLM project
|
||||
Licensed under the Apache License 2.0
|
||||
|
||||
================================================================================
|
||||
Apache 2.0 LICENSE
|
||||
================================================================================
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
@ -204,3 +281,54 @@ Portions of this project are under the following copyright:
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
================================================================================
|
||||
MIT LICENSE
|
||||
================================================================================
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
================================================================================
|
||||
BSD 3-Clause License
|
||||
================================================================================
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.<
|
||||
[](https://www.python.org/downloads/release/python-31012/)
|
||||
[](https://developer.nvidia.com/cuda-downloads)
|
||||
[](https://pytorch.org)
|
||||
[](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
|
||||
[](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
|
||||
[](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
|
||||
|
||||
[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html) | [Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html) | [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html) | [Documentation](https://nvidia.github.io/TensorRT-LLM/) | [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
|
||||
|
||||
@ -49,7 +49,7 @@ class RootArgs(BaseModel):
|
||||
return self
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.group(deprecated=True)
|
||||
@click.option(
|
||||
"--tokenizer",
|
||||
required=True,
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
|
||||
*AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -17,13 +18,16 @@
|
||||
*/
|
||||
|
||||
#include "utils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <random>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
|
||||
namespace tensorrt_llm::benchmark
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace benchmark
|
||||
{
|
||||
|
||||
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
|
||||
@ -98,7 +102,8 @@ Samples parseWorkloadJson(
|
||||
if (samples.size() < maxNumSamples)
|
||||
{
|
||||
TLLM_LOG_WARNING(
|
||||
"Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
|
||||
"Dataset size %zu is smaller than given max_num_samples "
|
||||
"%d, max_num_samples will be ignored.\n",
|
||||
samples.size(), maxNumSamples);
|
||||
}
|
||||
return samples;
|
||||
@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
|
||||
return os;
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::benchmark
|
||||
} // namespace benchmark
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/executor/executor.h"
|
||||
|
||||
#include <cstdint>
|
||||
@ -29,7 +30,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace tensorrt_llm::benchmark
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace benchmark
|
||||
{
|
||||
|
||||
// using namespace tensorrt_llm::batch_manager;
|
||||
@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int
|
||||
|
||||
std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);
|
||||
|
||||
} // namespace tensorrt_llm::benchmark
|
||||
} // namespace benchmark
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -1,2 +1,5 @@
|
||||
# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
|
||||
# is updated.
|
||||
# WAR against https://github.com/advisories/GHSA-gm62-xv2j-4w53
|
||||
# WAR against https://github.com/advisories/GHSA-2xpw-w6gg-jr37
|
||||
urllib3>=2.6.0
|
||||
|
||||
@ -78,9 +78,7 @@ using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
|
||||
using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
|
||||
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
|
||||
using CacheSaltIDType = tensorrt_llm::runtime::CacheSaltIDType;
|
||||
|
||||
// Type alias for multimodal hash key (hash array + start offset)
|
||||
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
|
||||
using MmKey = tensorrt_llm::executor::MmKey;
|
||||
|
||||
template <typename T>
|
||||
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
|
||||
@ -325,6 +323,8 @@ public:
|
||||
|
||||
size_t getHash() const;
|
||||
|
||||
std::vector<MmKey> getExtraKeys() const;
|
||||
|
||||
private:
|
||||
// Linear ID of block independent of pool
|
||||
IdType mBlockId;
|
||||
|
||||
@ -16,8 +16,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
// Base class for algorithms
|
||||
struct Algorithm
|
||||
@ -29,4 +30,4 @@ struct Algorithm
|
||||
Algorithm& operator=(Algorithm const&) = delete;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,9 +17,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
//!
|
||||
@ -100,4 +104,6 @@ private:
|
||||
size_type mSize;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,14 +16,19 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
class DebugConfig
|
||||
{
|
||||
public:
|
||||
static bool isCheckDebugEnabled();
|
||||
};
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
|
||||
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
|
||||
@ -35,8 +40,8 @@ public:
|
||||
#define TLLM_CHECK(val) \
|
||||
do \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
|
||||
@ -51,17 +56,17 @@ public:
|
||||
#define TLLM_CHECK_DEBUG(val) \
|
||||
do \
|
||||
{ \
|
||||
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
|
||||
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
|
||||
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
|
||||
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
|
||||
{ \
|
||||
TLLM_LIKELY(static_cast<bool>(val)) \
|
||||
? ((void) 0) \
|
||||
|
||||
@ -17,9 +17,13 @@
|
||||
#pragma once
|
||||
|
||||
#include "c10/util/intrusive_ptr.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <Python.h>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// Adapted from pybind11's example implementation:
|
||||
@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
|
||||
return c10::intrusive_ptr<T>::reclaim_copy(p);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
62
cpp/include/tensorrt_llm/common/config.h
Normal file
62
cpp/include/tensorrt_llm/common/config.h
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#ifndef TRTLLM_CONFIG_H
|
||||
#define TRTLLM_CONFIG_H
|
||||
|
||||
/**
|
||||
* \def TRTLLM_ABI_NAMESPACE
|
||||
* This macro is used to open an implicitly inline namespace block for the ABI version.
|
||||
* This macro can be overridden to change the ABI version.
|
||||
* The default ABI version is _v1.
|
||||
*/
|
||||
#ifndef TRTLLM_ABI_NAMESPACE
|
||||
#define TRTLLM_ABI_NAMESPACE _v1
|
||||
#endif
|
||||
|
||||
#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
|
||||
#define TRTLLM_ABI_NAMESPACE_BEGIN \
|
||||
inline namespace TRTLLM_ABI_NAMESPACE \
|
||||
{
|
||||
#endif
|
||||
|
||||
#ifndef TRTLLM_ABI_NAMESPACE_END
|
||||
#define TRTLLM_ABI_NAMESPACE_END }
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def TRTLLM_NAMESPACE_BEGIN
|
||||
* This macro is used to open a `tensorrt_llm::` namespace block, along with any
|
||||
* enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
|
||||
* This macro is defined by TensorRT-LLM and may not be overridden.
|
||||
*/
|
||||
#define TRTLLM_NAMESPACE_BEGIN \
|
||||
namespace tensorrt_llm \
|
||||
{ \
|
||||
TRTLLM_ABI_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* \def TRTLLM_NAMESPACE_END
|
||||
* This macro is used to close a `tensorrt_llm::` namespace block, along with any
|
||||
* enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
|
||||
* This macro is defined by TensorRT-LLM and may not be overridden.
|
||||
*/
|
||||
#define TRTLLM_NAMESPACE_END \
|
||||
TRTLLM_ABI_NAMESPACE_END \
|
||||
} /* end namespace tensorrt_llm */
|
||||
|
||||
#endif // TRTLLM_CONFIG_H
|
||||
@ -16,6 +16,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
#include <cuda_fp8.h>
|
||||
#include <cuda_runtime.h>
|
||||
@ -29,8 +31,8 @@
|
||||
#define USE_QGMMA
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
|
||||
const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
@ -14,12 +14,18 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/// @brief Populate the start and end profiling iteration indexes from the provided environment variables
|
||||
@ -28,4 +34,6 @@ namespace tensorrt_llm::common
|
||||
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
|
||||
std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
@ -49,7 +50,9 @@
|
||||
// this undef.
|
||||
#endif // WIN32
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// workspace for cublas gemm : 32MB
|
||||
@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
|
||||
DEFINE_MEMBER_CHECKER(qua)
|
||||
DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
/*
|
||||
* Macros compliant with TensorRT coding conventions
|
||||
|
||||
@ -16,11 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
#include <NvInferRuntime.h>
|
||||
#include <map>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
constexpr static size_t getDTypeSize(nvinfer1::DataType type)
|
||||
@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
|
||||
return "";
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -22,9 +22,12 @@
|
||||
#include <string>
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
class Logger
|
||||
@ -125,12 +128,12 @@ private:
|
||||
|
||||
static inline std::string getPrefix(Level const level)
|
||||
{
|
||||
return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
|
||||
return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
|
||||
}
|
||||
|
||||
static inline std::string getPrefix(Level const level, int const rank)
|
||||
{
|
||||
return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
|
||||
return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
|
||||
}
|
||||
};
|
||||
|
||||
@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
|
||||
out << std::endl;
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#define TLLM_LOG(level, ...) \
|
||||
do \
|
||||
@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
|
||||
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
|
||||
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
|
||||
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
@ -16,11 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/**
|
||||
@ -100,4 +104,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -480,4 +482,5 @@ public:
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#if ENABLE_BF16
|
||||
#include <cuda_bf16.h>
|
||||
#endif // ENABLE_BF16
|
||||
@ -28,7 +29,9 @@
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
#if ENABLE_BF16
|
||||
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
|
||||
@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
#include <array>
|
||||
@ -41,7 +42,9 @@
|
||||
tensorrt_llm::common::RequestSpecificException( \
|
||||
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/// @brief Enumeration of different error codes for request-specific exceptions
|
||||
@ -77,7 +80,8 @@ private:
|
||||
|
||||
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
|
||||
{
|
||||
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
|
||||
throw TllmException(
|
||||
file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
|
||||
}
|
||||
|
||||
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
|
||||
@ -102,4 +106,6 @@ private:
|
||||
RequestErrorCode mErrorCode;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <initializer_list>
|
||||
#include <string>
|
||||
@ -24,7 +26,9 @@
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
inline bool setThreadName(std::string const& name)
|
||||
@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
|
||||
return std::find(c.begin(), c.end(), v) != c.end();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -66,6 +66,7 @@ public:
|
||||
[[nodiscard]] virtual std::vector<Connection const*> getConnections(CommState const& state) = 0;
|
||||
|
||||
[[nodiscard]] virtual CommState const& getCommState() const = 0;
|
||||
[[nodiscard]] virtual bool isRunning() const = 0;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::executor::kv_cache
|
||||
|
||||
@ -47,6 +47,12 @@ class BaseKVCacheManager;
|
||||
namespace tensorrt_llm::executor
|
||||
{
|
||||
|
||||
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
||||
// Mmkey is used in KVCacheBlock when multimodal data presents in a block.
|
||||
// Type alias for hash array + start offset at per-block granularity.
|
||||
// This differs from the per-request level multimodal hash in MultimodalInput.
|
||||
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
|
||||
|
||||
/// @brief Version of TRT-LLM
|
||||
char const* version() noexcept;
|
||||
|
||||
@ -1691,12 +1697,14 @@ struct KVCacheStoredBlockData
|
||||
{
|
||||
|
||||
KVCacheStoredBlockData(IdType blockHash, tensorrt_llm::runtime::VecUniqueTokens tokens,
|
||||
std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId, SizeType32 cacheLevel, SizeType32 priority)
|
||||
std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId, SizeType32 cacheLevel, SizeType32 priority,
|
||||
std::vector<MmKey> mmKeys = {})
|
||||
: blockHash{blockHash}
|
||||
, tokens{std::move(tokens)}
|
||||
, loraId{loraId}
|
||||
, cacheLevel{cacheLevel}
|
||||
, priority{priority}
|
||||
, mmKeys{std::move(mmKeys)}
|
||||
{
|
||||
}
|
||||
|
||||
@ -1710,6 +1718,8 @@ struct KVCacheStoredBlockData
|
||||
SizeType32 cacheLevel;
|
||||
/// @brief The priority of the block
|
||||
SizeType32 priority;
|
||||
/// @brief The multimodal keys of the block
|
||||
std::vector<MmKey> mmKeys;
|
||||
};
|
||||
|
||||
struct KVCacheStoredData
|
||||
|
||||
@ -16,7 +16,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
namespace detail
|
||||
@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;
|
||||
|
||||
} // namespace arch
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,11 +17,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/executor/types.h"
|
||||
#include <cstdint>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
class FinishedState
|
||||
@ -308,4 +311,6 @@ template <typename T>
|
||||
void invokeScatterDecodingParams(
|
||||
T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,11 +17,14 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
class KVCacheIndex
|
||||
@ -53,4 +56,6 @@ private:
|
||||
UnderlyingType value;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,16 +14,18 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/runtime/iBuffer.h"
|
||||
|
||||
using namespace tensorrt_llm::runtime;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
|
||||
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
|
||||
cudaStream_t stream);
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -1,18 +1,18 @@
|
||||
# ##################################################################################################
|
||||
# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification, are not permit-
|
||||
# ted.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ##################################################################################################
|
||||
|
||||
# #################################################################################################
|
||||
|
||||
@ -200,38 +200,22 @@ ns_close = r"""
|
||||
|
||||
copyright = '''\
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are not permit-
|
||||
* ted.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
**************************************************************************************************/
|
||||
''' if not generate_cu_trtllm else r"""/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
|
||||
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
"""
|
||||
'''
|
||||
|
||||
makefile_template = '''\
|
||||
|
||||
@ -2175,7 +2159,8 @@ def get_kernel_code(kspec, kname, lname):
|
||||
params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
|
||||
attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
|
||||
bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
|
||||
include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
|
||||
include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
|
||||
include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
|
||||
num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
|
||||
fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
|
||||
const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
|
||||
@ -2201,8 +2186,19 @@ def get_kernel_code(kspec, kname, lname):
|
||||
const int COMPUTE_REG_COUNT = {compute_reg_count};
|
||||
asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
|
||||
compute_reg_count=compute_reg_count)
|
||||
local_ns_open = ns_open if generate_cu_trtllm else ''
|
||||
local_ns_close = ns_close if generate_cu_trtllm else ''
|
||||
abi_ns_open = r"""
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels
|
||||
{
|
||||
// clang-format off
|
||||
"""
|
||||
abi_ns_close = r"""
|
||||
// clang-format on
|
||||
} // namespace kernels
|
||||
TRTLLM_NAMESPACE_END
|
||||
"""
|
||||
local_ns_open = abi_ns_open if generate_cu_trtllm else ''
|
||||
local_ns_close = abi_ns_close if generate_cu_trtllm else ''
|
||||
|
||||
tmp = dict(locals(), **kspec._asdict())
|
||||
|
||||
@ -3077,8 +3073,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):
|
||||
def get_cubin_header(kernel_traits, specs_names):
|
||||
cubins = []
|
||||
cubin_lens = []
|
||||
launchers = []
|
||||
cubins_dict = {}
|
||||
cubin_lens_dict = {}
|
||||
launchers_dict = {}
|
||||
for kspec, fname, lname, kname in specs_names:
|
||||
if generate_cu_trtllm and not use_cubin_header(
|
||||
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
|
||||
@ -3282,11 +3280,11 @@ def get_cubin_header(kernel_traits, specs_names):
|
||||
if generate_cu_trtllm and lname != 'nullptr':
|
||||
launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
|
||||
lname=lname)
|
||||
if int(sm) in cubins_dict:
|
||||
if launcher not in cubins_dict[int(sm)]:
|
||||
cubins_dict[int(sm)].append(launcher)
|
||||
if int(sm) in launchers_dict:
|
||||
if launcher not in launchers_dict[int(sm)]:
|
||||
launchers_dict[int(sm)].append(launcher)
|
||||
else:
|
||||
cubins_dict[int(sm)] = [launcher]
|
||||
launchers_dict[int(sm)] = [launcher]
|
||||
elif 'mhca' in kname:
|
||||
code = '''\
|
||||
{{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
|
||||
@ -3309,17 +3307,33 @@ def get_cubin_header(kernel_traits, specs_names):
|
||||
else:
|
||||
metadata_v2 = ',\n'.join(metadata_v2)
|
||||
# Add macros to only include needed cubins during compilation.
|
||||
for sm in cubins_dict.keys():
|
||||
# Collect all SM versions from all dictionaries
|
||||
all_sms = sorted(
|
||||
set(
|
||||
list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
|
||||
list(launchers_dict.keys())))
|
||||
|
||||
for sm in all_sms:
|
||||
macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
|
||||
macro_end = f"#endif\n"
|
||||
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
|
||||
|
||||
# Add cubin array declarations
|
||||
if sm in cubins_dict:
|
||||
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
|
||||
|
||||
# Add cubin length declarations
|
||||
if sm in cubin_lens_dict:
|
||||
cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
|
||||
|
||||
# Add launcher declarations
|
||||
if sm in launchers_dict:
|
||||
launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])
|
||||
|
||||
unroll_config_v1 = ',\n'.join(unroll_config_v1)
|
||||
unroll_config_v2 = ',\n'.join(unroll_config_v2)
|
||||
cubins = '\n'.join(cubins)
|
||||
cubin_lens = '\n'.join(cubin_lens)
|
||||
launchers = '\n'.join(launchers)
|
||||
local_ns_open = ns_open
|
||||
local_ns_close = ns_close if generate_cu_trtllm else '}'
|
||||
launcher_line = '''
|
||||
@ -3431,7 +3445,157 @@ static const struct TestMetaV2
|
||||
|
||||
'''.format(**locals(), copyright=copyright)
|
||||
|
||||
return code
|
||||
# Generate header content (.h file)
|
||||
if "GENERATE_CUBIN" in os.environ:
|
||||
header_content = '''\
|
||||
{copyright}
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels{{
|
||||
|
||||
struct FusedMultiHeadAttentionKernelMetaInfoV2
|
||||
{{
|
||||
Data_type mDataTypeIn;
|
||||
Data_type mDataTypeOut;
|
||||
unsigned int mS;
|
||||
unsigned int mStepQ;
|
||||
unsigned int mStepKV;
|
||||
unsigned int mD;
|
||||
unsigned int mDV;
|
||||
unsigned int mSageBlockSizeQ;
|
||||
unsigned int mSageBlockSizeK;
|
||||
unsigned int mSageBlockSizeV;
|
||||
unsigned int mSM;
|
||||
const unsigned char* mCubin;
|
||||
unsigned int mCubinSize;
|
||||
const char* mFuncName;
|
||||
unsigned int mSharedMemBytes;
|
||||
unsigned int mThreadsPerCTA;
|
||||
unsigned int mUnrollStep;
|
||||
int mAttentionMaskType;
|
||||
int mAttentionInputLayout;
|
||||
bool mInterleaved;
|
||||
bool mFlashAttention;
|
||||
bool mWarpSpecialization;
|
||||
bool mFP32Accumulation;
|
||||
bool mAlibiSupported;
|
||||
bool mTiled;
|
||||
bool mEnableAttnLogitSoftcapping;
|
||||
bool mReturnSoftmaxStats;{launcher_line}
|
||||
}};
|
||||
|
||||
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
|
||||
extern const int sMhaKernelMetaInfosV2Size;
|
||||
|
||||
}} // namespace kernels
|
||||
TRTLLM_NAMESPACE_END
|
||||
'''.format(**locals(), copyright=copyright)
|
||||
# Generate source content (.cpp file)
|
||||
source_content = '''\
|
||||
{copyright}
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
{local_ns_open}
|
||||
|
||||
//--- Cubin Arrays
|
||||
{cubins}
|
||||
|
||||
//--- Cubin Lengths
|
||||
{cubin_lens}
|
||||
|
||||
{local_ns_close}
|
||||
|
||||
using namespace tensorrt_llm::kernels;
|
||||
|
||||
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
|
||||
|
||||
class Fused_multihead_attention_params_v2;
|
||||
class Launch_params;
|
||||
|
||||
//--- Kernel Launchers
|
||||
{launchers}
|
||||
|
||||
// FIXME: These are duplicated declarations, we should remove them in the future.
|
||||
constexpr int32_t kSM_70 = 70;
|
||||
constexpr int32_t kSM_72 = 72;
|
||||
constexpr int32_t kSM_75 = 75;
|
||||
constexpr int32_t kSM_80 = 80;
|
||||
constexpr int32_t kSM_86 = 86;
|
||||
constexpr int32_t kSM_89 = 89;
|
||||
constexpr int32_t kSM_90 = 90;
|
||||
constexpr int32_t kSM_100 = 100;
|
||||
constexpr int32_t kSM_100f = 10100;
|
||||
constexpr int32_t kSM_103 = 103;
|
||||
constexpr int32_t kSM_120 = 120;
|
||||
constexpr int32_t kSM_121 = 121;
|
||||
|
||||
// FIXME: These are duplicated declarations, we should remove them in the future.
|
||||
enum Data_type
|
||||
{{
|
||||
DATA_TYPE_BOOL,
|
||||
DATA_TYPE_FP16,
|
||||
DATA_TYPE_FP32,
|
||||
DATA_TYPE_INT4,
|
||||
DATA_TYPE_INT8,
|
||||
DATA_TYPE_INT32,
|
||||
DATA_TYPE_BF16,
|
||||
DATA_TYPE_E2M1,
|
||||
DATA_TYPE_E4M3,
|
||||
DATA_TYPE_E5M2
|
||||
}};
|
||||
|
||||
struct FusedMultiHeadAttentionKernelMetaInfoV2
|
||||
{{
|
||||
Data_type mDataTypeIn;
|
||||
Data_type mDataTypeOut;
|
||||
unsigned int mS;
|
||||
unsigned int mStepQ;
|
||||
unsigned int mStepKV;
|
||||
unsigned int mD;
|
||||
unsigned int mDV;
|
||||
unsigned int mSageBlockSizeQ;
|
||||
unsigned int mSageBlockSizeK;
|
||||
unsigned int mSageBlockSizeV;
|
||||
unsigned int mSM;
|
||||
const unsigned char* mCubin;
|
||||
unsigned int mCubinSize;
|
||||
const char* mFuncName;
|
||||
unsigned int mSharedMemBytes;
|
||||
unsigned int mThreadsPerCTA;
|
||||
unsigned int mUnrollStep;
|
||||
int mAttentionMaskType;
|
||||
int mAttentionInputLayout;
|
||||
bool mInterleaved;
|
||||
bool mFlashAttention;
|
||||
bool mWarpSpecialization;
|
||||
bool mFP32Accumulation;
|
||||
bool mAlibiSupported;
|
||||
bool mTiled;
|
||||
bool mEnableAttnLogitSoftcapping;
|
||||
bool mReturnSoftmaxStats;{launcher_line}
|
||||
}};
|
||||
|
||||
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
|
||||
{metadata_v2}
|
||||
}};
|
||||
|
||||
extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
|
||||
}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
|
||||
'''.format(**locals(), copyright=copyright)
|
||||
else:
|
||||
# Non-GENERATE_CUBIN mode: use old behavior
|
||||
header_content = code
|
||||
source_content = None
|
||||
|
||||
return header_content, source_content
|
||||
|
||||
|
||||
# This is used to add some kernels running in cubins for passing CI cases.
|
||||
@ -3449,9 +3613,20 @@ def modify_cubin_header(cubin_header):
|
||||
return result
|
||||
|
||||
target = "#ifndef EXCLUDE_SM_80"
|
||||
addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
|
||||
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
|
||||
result = add_kernel_line(result, target, addition)
|
||||
addition_cubin_array = """
|
||||
#ifndef EXCLUDE_SM_80
|
||||
extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
|
||||
#endif
|
||||
"""
|
||||
addition_cubin_length = """
|
||||
#ifndef EXCLUDE_SM_80
|
||||
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
|
||||
#endif
|
||||
"""
|
||||
# Add cubin array and length into there corresponding sections.
|
||||
result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
|
||||
result = add_kernel_line(result, "//--- Cubin Lengths",
|
||||
addition_cubin_length)
|
||||
|
||||
def modify_kernel_line(result, target, new_line):
|
||||
lines = result.split('\n')
|
||||
@ -3534,13 +3709,22 @@ def generate_files(specs_names):
|
||||
output = output.decode('utf-8').strip()
|
||||
# this gives: kname, smem bytes, threads_per_cta, loop_step
|
||||
kernel_traits = [traits.split() for traits in output.splitlines()]
|
||||
cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
|
||||
# Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files
|
||||
# To switch back to old behavior, replace get_cubin_header_and_source with get_cubin_header
|
||||
cubin_header, cubin_source = get_cubin_header(kernel_traits,
|
||||
valid_specs_names)
|
||||
if generate_cu_trtllm:
|
||||
cubin_header = modify_cubin_header(cubin_header)
|
||||
cubin_source = modify_cubin_header(cubin_source)
|
||||
|
||||
# Write fmha_cubin.h file
|
||||
with open('./generated/fmha_cubin.h', 'w') as f:
|
||||
f.write(cubin_header)
|
||||
|
||||
# Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
|
||||
if cubin_source is not None:
|
||||
with open('./generated/fmha_cubin.cpp', 'w') as f:
|
||||
f.write(cubin_source)
|
||||
|
||||
|
||||
def enumerate_hgmma_tma_kernels(specs, sm=90):
|
||||
specs.append(
|
||||
|
||||
@ -1,18 +1,18 @@
|
||||
# ##################################################################################################
|
||||
# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification, are not permit-
|
||||
# ted.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ##################################################################################################
|
||||
|
||||
# #################################################################################################
|
||||
|
||||
@ -32,20 +32,20 @@ dtype2traits = {
|
||||
|
||||
fmha_dgrad_v2_flash_attention_template = '''\
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are not permit-
|
||||
* ted.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
**************************************************************************************************/
|
||||
|
||||
#include "fused_multihead_attention_fprop.h"
|
||||
@ -157,20 +157,20 @@ void run_fmha_dgrad_v2_flash_attention_{dtype}_S_{head_size}_sm{sm}(
|
||||
|
||||
fmha_fprop_v2_flash_attention_template = '''\
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are not permit-
|
||||
* ted.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
**************************************************************************************************/
|
||||
|
||||
#include "fused_multihead_attention_fprop.h"
|
||||
|
||||
@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
namespace tensorrt_llm {
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels {
|
||||
'''
|
||||
|
||||
@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
|
||||
'''
|
||||
|
||||
TEMPLATE_EPILOGUE = '''}
|
||||
}
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
'''
|
||||
|
||||
D = defaultdict(list)
|
||||
|
||||
@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
namespace kernels
|
||||
{
|
||||
// clang-format off
|
||||
@ -96,7 +98,7 @@ namespace kernels
|
||||
cpp_file_suffex_text = R"""
|
||||
// clang-format on
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
"""
|
||||
|
||||
cubin_meta_info_struct_prefix_text = R"""
|
||||
|
||||
@ -466,20 +466,53 @@ using WarpAcc = WarpAccT<warpTile.y, warpTile.x>;
|
||||
#define MMAS_N_PER_MASK 2
|
||||
|
||||
__device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskType const* mask, uint32_t rowOffset,
|
||||
uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize)
|
||||
uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize
|
||||
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
|
||||
,
|
||||
int32_t tok0WinBeg, uint32_t seqIter, uint32_t const cacheSeqLen, uint32_t const warpTileTokenBeg
|
||||
#endif
|
||||
)
|
||||
{
|
||||
uint32_t const idxInQuad = laneId() % 4;
|
||||
uint32_t const idxQuad = laneId() / 4;
|
||||
// Packed mask is aligned with 32 bits (2 uint16_t).
|
||||
uint32_t const nbPackedMasksPerRow = divUp(qSeqLen, 32u) * 2u;
|
||||
uint16_t const* uint16Mask = reinterpret_cast<uint16_t const*>(mask);
|
||||
constexpr uint64_t fullMask = ~uint64_t{0};
|
||||
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
|
||||
Range const tileRange = {warpTileTokenBeg, warpTileTokenBeg + warpTile.x};
|
||||
Range const maxMaskOutRange = {0, mha::max(0, tok0WinBeg) + (nbValidRows / MMAS_N_PER_MASK - 1)};
|
||||
bool const ctaNeedBegMask = tileRange.beg < maxMaskOutRange.end;
|
||||
assert(ctaNeedBegMask == overlap(tileRange, maxMaskOutRange));
|
||||
int32_t const tok0NbMaskOut = int32_t(tok0WinBeg) - int32_t(warpTileTokenBeg);
|
||||
uint32_t const nbSeqItersWithoutSpecDecMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
|
||||
bool const ctaNeedSpecDecMask = (seqIter >= nbSeqItersWithoutSpecDecMask);
|
||||
#else
|
||||
constexpr bool ctaNeedBegMask = false;
|
||||
bool const ctaNeedSpecDecMask = true;
|
||||
int32_t const tok0NbMaskOut = -2147483648;
|
||||
#endif
|
||||
bool const needMask = ctaNeedBegMask || ctaNeedSpecDecMask;
|
||||
|
||||
if (!needMask)
|
||||
{
|
||||
return;
|
||||
}
|
||||
#pragma unroll
|
||||
for (uint32_t m = 0; m < acc.rows; m++)
|
||||
{
|
||||
#pragma unroll
|
||||
for (uint32_t i = 0; i < InstAcc::rows; i++)
|
||||
{
|
||||
uint32_t const tokenRow = min((rowOffset + instM * m + idxQuad + i * 8) / headGrpSize, actualQSeqLen - 1);
|
||||
uint32_t const idxQTokInCta = (rowOffset + instM * m + idxQuad + i * 8) / headGrpSize;
|
||||
uint32_t const tokenRow = min(idxQTokInCta, actualQSeqLen - 1);
|
||||
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
|
||||
int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta);
|
||||
uint64_t const begMask = (begNbMaskOut > 0 ? fullMask << begNbMaskOut : fullMask);
|
||||
#else
|
||||
uint64_t const begMask = fullMask;
|
||||
#endif
|
||||
|
||||
#pragma unroll
|
||||
for (uint32_t mask_n = 0; mask_n < acc.cols / MMAS_N_PER_MASK; mask_n++)
|
||||
{
|
||||
@ -491,12 +524,15 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
|
||||
uint32_t const maskPos1 = lastCol + actualQSeqLen < nbValidCols
|
||||
? 0u
|
||||
: min(lastCol + actualQSeqLen - nbValidCols, actualQSeqLen - 1);
|
||||
uint32_t packedMask = 0u;
|
||||
uint32_t const maskPosStart = (maskPos0 / 16) * 16;
|
||||
reinterpret_cast<uint16_t*>(&packedMask)[0]
|
||||
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
|
||||
reinterpret_cast<uint16_t*>(&packedMask)[1]
|
||||
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
|
||||
uint32_t packedMask = ~uint32_t{0};
|
||||
if (ctaNeedSpecDecMask)
|
||||
{
|
||||
reinterpret_cast<uint16_t*>(&packedMask)[0]
|
||||
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
|
||||
reinterpret_cast<uint16_t*>(&packedMask)[1]
|
||||
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
|
||||
}
|
||||
#pragma unroll
|
||||
for (uint32_t nj = 0; nj < MMAS_N_PER_MASK; nj++)
|
||||
{
|
||||
@ -510,7 +546,11 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
|
||||
bool const maskFlag = col + actualQSeqLen < nbValidCols
|
||||
? true
|
||||
: packedMask & (1u << ((col + actualQSeqLen - nbValidCols) - maskPosStart));
|
||||
acc(m, n)(i, j) = maskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
|
||||
|
||||
bool const begMaskFlag = ctaNeedBegMask ? (begMask & (1ULL << col)) : true;
|
||||
|
||||
acc(m, n)(i, j)
|
||||
= maskFlag && begMaskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1611,8 +1651,14 @@ CUBIN_EXPORT __global__
|
||||
#endif
|
||||
|
||||
uint32_t const cacheSeqLen = getCacheSeqLen<usePagedKVCache>(cacheList, idxReq);
|
||||
#if SLIDING_WINDOW
|
||||
#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
|
||||
uint32_t const tok0SeqLen = cacheSeqLen - actualQSeqLen + 1 + idxHeadTokenInGrp; // ctaTokOffset;
|
||||
int32_t const tok0WinBeg = int32_t(tok0SeqLen) - int32_t(slidingWinSize);
|
||||
uint32_t const nbTotalSkipTokens = mha::max(0, tok0WinBeg);
|
||||
|
||||
#elif SLIDING_WINDOW
|
||||
bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize);
|
||||
assert(!SPEC_DEC || !rtIsReallySliding);
|
||||
uint32_t const nbTotalSkipTokens = rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0;
|
||||
#else
|
||||
constexpr bool rtIsReallySliding = false;
|
||||
@ -1626,7 +1672,9 @@ CUBIN_EXPORT __global__
|
||||
#endif
|
||||
|
||||
uint32_t const nbSeqIters = useKVCache ? divUp(cacheSeqLen, ctaTile.x) : 0;
|
||||
#if SPEC_DEC
|
||||
#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
|
||||
uint32_t const nbSeqItersWithoutMask = nbSkipLeadingTiles;
|
||||
#elif SPEC_DEC
|
||||
uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
|
||||
#endif
|
||||
|
||||
@ -1912,8 +1960,12 @@ CUBIN_EXPORT __global__
|
||||
if (seqIter >= nbSeqItersWithoutMask)
|
||||
{
|
||||
uint32_t const nbValidCols = (warpTileTokenBeg < cacheSeqLen ? cacheSeqLen - warpTileTokenBeg : 0U);
|
||||
applyMaskFromInput(
|
||||
warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize);
|
||||
applyMaskFromInput(warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize
|
||||
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
|
||||
,
|
||||
tok0WinBeg, seqIter, cacheSeqLen, warpTileTokenBeg
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#else
|
||||
bool const isFirstIter = (seqIter == nbSkipLeadingTiles);
|
||||
|
||||
@ -247,7 +247,8 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
|
||||
{
|
||||
break;
|
||||
}
|
||||
else if (req->isGenerationInProgressState())
|
||||
|
||||
if (req->isGenerationInProgressState())
|
||||
{
|
||||
scheduledRequests.emplace_back(req);
|
||||
reservedBlocks.decrementReservedBlocks(*req);
|
||||
@ -296,7 +297,8 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
|
||||
{
|
||||
break;
|
||||
}
|
||||
else if (req->isContextInitState() || req->isDisaggGenerationInitState())
|
||||
|
||||
if (req->isContextInitState() || req->isDisaggGenerationInitState())
|
||||
{
|
||||
bool enoughBlocks = reservedBlocks.enoughAvailableBlocks(*req);
|
||||
bool enoughCrossBlocks
|
||||
|
||||
@ -360,6 +360,12 @@ public:
|
||||
RequestInfo info;
|
||||
auto const* connection = isAgent ? agentConnectionManager->recvConnectionAndRequestInfo(info)
|
||||
: mManager->recvConnect(DataContext{TransceiverTag::kID_TAG}, &id, sizeof(id));
|
||||
if (connection == nullptr && !mManager->isRunning())
|
||||
{
|
||||
TLLM_LOG_WARNING(" recvRequestInfo connection is nullptr, maybe the server is terminating");
|
||||
return info;
|
||||
}
|
||||
|
||||
if (!isAgent)
|
||||
{
|
||||
TLLM_CHECK(id == TransceiverTag::Id::REQUEST_SEND);
|
||||
@ -616,6 +622,10 @@ private:
|
||||
if (!mReadyResponses.empty())
|
||||
{
|
||||
auto const& requestInfo = recvRequestInfo();
|
||||
if (mTerminate || !mManager->isRunning())
|
||||
{
|
||||
return;
|
||||
}
|
||||
auto reqId = requestInfo.getRequestId();
|
||||
|
||||
{
|
||||
|
||||
@ -102,7 +102,7 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks
|
||||
for (auto const& block : blocks)
|
||||
{
|
||||
data.blocks.emplace_back(block->getHash(), block->getUniqueTokens(), block->getBlockKey().loraTaskId,
|
||||
block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority());
|
||||
block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority(), block->getExtraKeys());
|
||||
}
|
||||
|
||||
enqueueEvent({mEventId++, data, windowSize, mAttentionDpRank});
|
||||
|
||||
@ -284,6 +284,11 @@ tk::KVCacheIndex::UnderlyingType KVCacheBlock::getMemoryPoolBlockIndex() const
|
||||
return mMemoryPoolBlockIndex.get();
|
||||
}
|
||||
|
||||
std::vector<MmKey> KVCacheBlock::getExtraKeys() const
|
||||
{
|
||||
return mBlockKey.extraKeys;
|
||||
}
|
||||
|
||||
bool KVCacheBlock::isPrimary() const
|
||||
{
|
||||
return mMemoryPoolBlockIndex.isPrimary();
|
||||
|
||||
@ -27,7 +27,7 @@ bool initCheckDebug()
|
||||
}
|
||||
} // namespace
|
||||
|
||||
bool DebugConfig::isCheckDebugEnabled()
|
||||
bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
|
||||
{
|
||||
static bool const debugEnabled = initCheckDebug();
|
||||
return debugEnabled;
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#include "attentionOp.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/memoryUtils.h"
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/common/quantization.h"
|
||||
@ -36,7 +37,9 @@
|
||||
#include <nccl.h>
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
namespace tensorrt_llm::common::op
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::op
|
||||
{
|
||||
|
||||
class AttentionOp
|
||||
@ -543,4 +546,6 @@ private:
|
||||
UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common::op
|
||||
} // namespace common::op
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cublasVersionCheck.h"
|
||||
#include <algorithm>
|
||||
#include <unordered_map>
|
||||
@ -24,8 +25,8 @@
|
||||
#error CUDART_VERSION Undefined!
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t
|
||||
|
||||
} // namespace common
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <cublasLt.h>
|
||||
#include <cublas_v2.h>
|
||||
@ -24,8 +25,8 @@
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -185,4 +186,4 @@ public:
|
||||
|
||||
} // namespace common
|
||||
|
||||
} // namespace tensorrt_llm
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
|
||||
#endif // ENABLE_BF16
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
// Operator definitions intentionally in global namespace
|
||||
namespace
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <assert.h>
|
||||
#include <cstdlib>
|
||||
@ -28,8 +29,8 @@
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
static __host__ __device__ int hash(int val)
|
||||
@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
|
||||
#define dllOpen(name) LoadLibrary("nv" name ".dll")
|
||||
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
|
||||
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
|
||||
@ -29,6 +30,7 @@
|
||||
#endif // defined(_WIN32)
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaDriverWrapper.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <cuda.h>
|
||||
@ -36,7 +38,9 @@
|
||||
#include <cstdio>
|
||||
#include <mutex>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
|
||||
@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
|
||||
return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
#ifndef CUDA_DRIVER_WRAPPER_H
|
||||
#define CUDA_DRIVER_WRAPPER_H
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
@ -25,7 +26,9 @@
|
||||
#include <cstdio>
|
||||
#include <memory>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
class CUDADriverWrapper
|
||||
@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
/*
|
||||
* Macros compliant with TensorRT coding conventions
|
||||
*/
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
@ -24,8 +25,8 @@
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
#ifdef ENABLE_FP8
|
||||
@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
|
||||
|
||||
#endif // ENABLE_FP8
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/cudaProfilerUtils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include <cstdint>
|
||||
@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
|
||||
@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
|
||||
return std::make_pair(profileIterIdxs, stopIterIdxs);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -25,9 +25,10 @@
|
||||
#if ENABLE_BF16
|
||||
#include <cuda_bf16.h>
|
||||
#endif
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/envUtils.h"
|
||||
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
|
||||
@ -25,7 +26,9 @@
|
||||
using tensorrt_llm::kernels::AllReduceFusionOp;
|
||||
using tensorrt_llm::kernels::AllReduceStrategyType;
|
||||
|
||||
namespace tensorrt_llm::utils::customAllReduceUtils
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace utils::customAllReduceUtils
|
||||
{
|
||||
|
||||
constexpr size_t NUM_POINTERS_PER_RANK = 7;
|
||||
@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
|
||||
{90, AllReduceBestStrategyTableSM90},
|
||||
{100, AllReduceBestStrategyTableSM100},
|
||||
};
|
||||
} // namespace tensorrt_llm::utils::customAllReduceUtils
|
||||
} // namespace utils::customAllReduceUtils
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
*/
|
||||
|
||||
#include "envUtils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
@ -25,7 +26,9 @@
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::optional<int32_t> getIntEnv(char const* name)
|
||||
@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
|
||||
return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,13 +16,16 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
// Useful when you want to inject some debug code controllable with env var.
|
||||
std::optional<int32_t> getIntEnv(char const* name);
|
||||
@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();
|
||||
|
||||
bool getEnvEplbForceGdrcopy();
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
#ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
|
||||
#define TRTLLM_CUDA_LAMPORT_UTILS_CUH
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <array>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
@ -29,7 +30,9 @@
|
||||
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
|
||||
@ -279,6 +282,7 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH
|
||||
|
||||
@ -15,12 +15,15 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
Logger::Logger()
|
||||
@ -70,4 +73,6 @@ Logger* Logger::getLogger()
|
||||
thread_local Logger instance;
|
||||
return &instance;
|
||||
}
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,10 +16,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,11 +14,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "mcastDevMemUtils.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <unordered_map>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
|
||||
{
|
||||
return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
|
||||
}
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,13 +15,17 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
// Avoid circular dependency
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
namespace tensorrt_llm::runtime
|
||||
{
|
||||
class McastDeviceMemory;
|
||||
}
|
||||
} // namespace tensorrt_llm::runtime
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
// Avoid circular dependency
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
|
||||
// Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
|
||||
@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
|
||||
// information. Thus a derived pointer cannot used as the key.
|
||||
McastDeviceMemory* findMcastDevMemBuffer(void* ptr);
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/memoryUtils.h"
|
||||
@ -25,8 +26,8 @@
|
||||
|
||||
#include <sanitizer/asan_interface.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -961,4 +962,5 @@ void calcAlignedPointers(
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,13 +16,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
|
||||
@ -46,7 +47,9 @@
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common::nccl_util
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::nccl_util
|
||||
{
|
||||
|
||||
//==============================================================================
|
||||
@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
|
||||
return std::make_pair(tensor, buffer);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nccl_util
|
||||
} // namespace common::nccl_util
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#endif // ENABLE_MULTI_DEVICE
|
||||
|
||||
@ -25,10 +25,13 @@
|
||||
#if defined(__clang__)
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace tensorrt_llm::common::nvtx
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::nvtx
|
||||
{
|
||||
inline nvtx3::color nextColor()
|
||||
{
|
||||
@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::nvtx
|
||||
} // namespace common::nvtx
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
|
||||
::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
|
||||
#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)
|
||||
|
||||
@ -29,6 +29,7 @@
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
|
||||
@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
|
||||
});
|
||||
return creator();
|
||||
}
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cublasMMWrapper.h"
|
||||
#include "tensorrt_llm/common/workspace.h"
|
||||
|
||||
@ -37,7 +38,9 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace tensorrt_llm::common::op
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::op
|
||||
{
|
||||
|
||||
// Write values into buffer
|
||||
@ -178,7 +181,7 @@ struct hash
|
||||
|
||||
// for testing only
|
||||
void const* getCommSessionHandle();
|
||||
} // namespace tensorrt_llm::common::op
|
||||
} // namespace common::op
|
||||
|
||||
inline bool isBuilding()
|
||||
{
|
||||
@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
|
||||
std::shared_ptr<cublasHandle_t> getCublasHandle();
|
||||
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#ifndef DEBUG
|
||||
|
||||
#define PLUGIN_CHECK(status) \
|
||||
|
||||
@ -16,14 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <float.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
|
||||
#endif // ENABLE_FP8
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#else
|
||||
#include <cooperative_groups.h>
|
||||
#endif
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
@ -30,8 +31,8 @@
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
#include "safetensors.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
@ -25,7 +26,9 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::common::safetensors
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::safetensors
|
||||
{
|
||||
using nvinfer1::DataType;
|
||||
|
||||
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
|
||||
{
|
||||
return std::make_shared<SafeTensor>(filename);
|
||||
}
|
||||
} // namespace tensorrt_llm::common::safetensors
|
||||
} // namespace common::safetensors
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include <NvInferRuntime.h>
|
||||
#include <cstdint>
|
||||
@ -23,7 +24,9 @@
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace tensorrt_llm::common::safetensors
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::safetensors
|
||||
{
|
||||
class INdArray
|
||||
{
|
||||
@ -58,4 +61,6 @@ public:
|
||||
virtual ~ISafeTensor() = default;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common::safetensors
|
||||
} // namespace common::safetensors
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
|
||||
namespace tensorrt_llm::common::stl_utils
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::stl_utils
|
||||
{
|
||||
|
||||
template <typename TInputIt, typename TOutputIt, typename TBinOp>
|
||||
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common::stl_utils
|
||||
} // namespace common::stl_utils
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <cerrno>
|
||||
#include <cstdarg>
|
||||
@ -23,7 +24,9 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
|
||||
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
|
||||
return values;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,13 +14,16 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <chrono>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "tensorrt_llm/common/timestampUtils.h"
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::string getCurrentTimestamp()
|
||||
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,12 +14,17 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
|
||||
std::string getCurrentTimestamp();
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/stringUtils.h"
|
||||
|
||||
#include <cinttypes>
|
||||
@ -26,7 +27,9 @@
|
||||
#endif
|
||||
#include <sstream>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
namespace
|
||||
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
|
||||
return mErrorCode;
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,10 +14,13 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// CuBLAS >= 12.9.1 requires 256-byte alignment.
|
||||
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
|
||||
return total;
|
||||
}
|
||||
|
||||
}; // namespace tensorrt_llm::common
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -18,10 +18,11 @@
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include "cutlass/device_kernel.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace cutlass_extensions
|
||||
{
|
||||
|
||||
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
|
||||
}
|
||||
|
||||
} // namespace cutlass_extensions
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -30,10 +30,11 @@
|
||||
#include "cutlass/epilogue/thread/linear_combination_relu.h"
|
||||
#include "cutlass/epilogue/thread/linear_combination_silu.h"
|
||||
#include "cutlass_extensions/epilogue/thread/fused_activations.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cutlass/epilogue/fusion/operations.hpp>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace cutlass_extensions
|
||||
{
|
||||
|
||||
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
|
||||
};
|
||||
|
||||
} // namespace cutlass_extensions
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -24,10 +24,11 @@
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/tllmException.h"
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace cutlass_extensions
|
||||
{
|
||||
|
||||
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
|
||||
}
|
||||
|
||||
} // namespace cutlass_extensions
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -319,6 +319,10 @@ AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo(batc
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
if (!mIsRunning)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
updateUnhandledNotifications();
|
||||
std::scoped_lock lock(mNotificationMutex);
|
||||
auto it = mUnhandledNotifications.begin();
|
||||
@ -491,6 +495,11 @@ void AgentConnectionManager::waitForNotification(std::string const& remoteAgentN
|
||||
while (true)
|
||||
{
|
||||
|
||||
if (!mIsRunning)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
updateUnhandledNotifications();
|
||||
std::scoped_lock lock(mNotificationMutex);
|
||||
auto it = mUnhandledNotifications.begin();
|
||||
@ -587,6 +596,13 @@ std::string const& AgentConnectionManager::getAgentName() const
|
||||
|
||||
AgentConnectionManager::~AgentConnectionManager()
|
||||
{
|
||||
mIsRunning = false;
|
||||
m_Agent->deregisterMemory(mRegMemDescs);
|
||||
}
|
||||
|
||||
bool AgentConnectionManager::isRunning() const
|
||||
{
|
||||
return mIsRunning;
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::executor::kv_cache
|
||||
|
||||
@ -296,6 +296,7 @@ public:
|
||||
void waitForNotification(std::string const& remoteAgentName, NotificationType& expectedInfo);
|
||||
void waitForSyncInfo(std::string const& remoteAgentName, NotificationSyncInfo& syncInfo);
|
||||
void waitForReadySignal(std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo);
|
||||
[[nodiscard]] bool isRunning() const override;
|
||||
|
||||
private:
|
||||
std::map<std::string, std::shared_ptr<AgentConnection>> mConnections;
|
||||
@ -309,6 +310,7 @@ private:
|
||||
int mDeviceId;
|
||||
std::string mAgentName;
|
||||
MemoryDescs mRegMemDescs;
|
||||
std::atomic<bool> mIsRunning{true};
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::executor::kv_cache
|
||||
|
||||
@ -77,4 +77,13 @@ CommState const& MpiConnectionManager::getCommState() const
|
||||
return mCommState;
|
||||
}
|
||||
|
||||
bool MpiConnectionManager::isRunning() const
|
||||
{
|
||||
return mIsRunning;
|
||||
}
|
||||
|
||||
MpiConnectionManager::~MpiConnectionManager()
|
||||
{
|
||||
mIsRunning = false;
|
||||
}
|
||||
} // namespace tensorrt_llm::executor::kv_cache
|
||||
|
||||
@ -42,14 +42,17 @@ class MpiConnectionManager : public ConnectionManager
|
||||
{
|
||||
public:
|
||||
MpiConnectionManager(mpi::MpiComm const* comm);
|
||||
~MpiConnectionManager();
|
||||
MpiConnection const* recvConnect(DataContext const& ctx, void* data, size_t size) override;
|
||||
[[nodiscard]] std::vector<Connection const*> getConnections(CommState const& state) override;
|
||||
[[nodiscard]] CommState const& getCommState() const override;
|
||||
[[nodiscard]] bool isRunning() const override;
|
||||
|
||||
private:
|
||||
mpi::MpiComm const* mComm;
|
||||
std::map<int, MpiConnection> mConnections;
|
||||
CommState mCommState;
|
||||
std::atomic<bool> mIsRunning{true};
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::executor::kv_cache
|
||||
|
||||
@ -454,21 +454,10 @@ void NixlTransferAgent::invalidateRemoteAgent(std::string const& name)
|
||||
|
||||
void NixlTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage)
|
||||
{
|
||||
if (name == mName)
|
||||
{
|
||||
// FIXME: nixl does not support gen notif to itself ,but support local transfer. we use local transfer to notify
|
||||
// itself
|
||||
MemoryDescs descs{MemoryType::kDRAM, {MemoryDesc{mDRamSrcBuffer}, MemoryDesc{mDRamDstBuffer}}};
|
||||
TransferRequest request{TransferOp::kWRITE, descs, descs, name, syncMessage};
|
||||
auto request_status = submitTransferRequests(request);
|
||||
request_status->wait();
|
||||
}
|
||||
else
|
||||
{
|
||||
auto status = mRawAgent->genNotif(name, syncMessage);
|
||||
TLLM_CHECK_WITH_INFO(
|
||||
status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str());
|
||||
}
|
||||
|
||||
auto status = mRawAgent->genNotif(name, syncMessage);
|
||||
TLLM_CHECK_WITH_INFO(
|
||||
status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str());
|
||||
}
|
||||
|
||||
[[nodiscard]] std::unordered_map<std::string, std::vector<SyncMessage>> NixlTransferAgent::getNotifiedSyncMessages()
|
||||
|
||||
@ -504,7 +504,7 @@ UcxConnectionManager::~UcxConnectionManager()
|
||||
socket.close();
|
||||
mZmqRepThread.join();
|
||||
}
|
||||
|
||||
mIsRunning = false;
|
||||
mZmqRepSocket.close();
|
||||
|
||||
mZmqContext.close();
|
||||
@ -673,6 +673,11 @@ std::vector<Connection const*> UcxConnectionManager::getConnections(CommState co
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool UcxConnectionManager::isRunning() const
|
||||
{
|
||||
return mIsRunning;
|
||||
}
|
||||
|
||||
CommState const& UcxConnectionManager::getCommState() const
|
||||
{
|
||||
return mCommState;
|
||||
|
||||
@ -62,6 +62,7 @@ private:
|
||||
zmq::socket_t mZmqRepSocket;
|
||||
std::string mZmqRepEndpoint;
|
||||
std::thread mZmqRepThread;
|
||||
std::atomic<bool> mIsRunning{true};
|
||||
|
||||
UcxConnection::ConnectionIdType getNewConnectionId(std::shared_ptr<ucxx::Endpoint> const& newEp);
|
||||
UcxConnection::ConnectionIdType addConnection(std::string const& ip, uint16_t port);
|
||||
@ -85,6 +86,8 @@ public:
|
||||
{
|
||||
return mRank;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool isRunning() const override;
|
||||
};
|
||||
|
||||
#if defined(__clang__)
|
||||
|
||||
@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
|
||||
namespace
|
||||
{
|
||||
|
||||
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
[[nodiscard]] bool executorConfigIsValid(
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
{
|
||||
// Make sure logic in this function matches fixExecutorConfig
|
||||
if (executorConfig.getEnableChunkedContext())
|
||||
@ -65,8 +66,8 @@ namespace
|
||||
return true;
|
||||
}
|
||||
|
||||
[[nodiscard]] ExecutorConfig fixExecutorConfig(
|
||||
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
[[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
|
||||
{
|
||||
// Make sure logic in this function matches executorConfigIsValid
|
||||
auto fixedExecutorConfig = executorConfig;
|
||||
@ -241,7 +242,7 @@ private:
|
||||
|
||||
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
|
||||
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
|
||||
ExecutorConfig const& executorConfig, bool isEncoder,
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
|
||||
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
|
||||
{
|
||||
auto const gpusPerNode = jsonConfig.getGpusPerNode();
|
||||
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
|
||||
|
||||
Executor::Impl::Impl(std::filesystem::path const& modelPath,
|
||||
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
|
||||
|
||||
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
|
||||
|
||||
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
|
||||
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
|
||||
ModelType const modelType, ExecutorConfig const& executorConfig,
|
||||
ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
|
||||
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
|
||||
{
|
||||
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
|
||||
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
|
||||
}
|
||||
|
||||
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto const& worldConfig = model->getWorldConfig();
|
||||
auto const tp = worldConfig.getTensorParallelism();
|
||||
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
|
||||
shutdown();
|
||||
}
|
||||
|
||||
void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
|
||||
void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
|
||||
|
||||
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto const gptModelType = [&executorConfig, &modelConfig]()
|
||||
{
|
||||
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
|
||||
|
||||
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
|
||||
ExecutorConfig const& executorConfig)
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
|
||||
{
|
||||
auto fixedExecutorConfig = ExecutorConfig{};
|
||||
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
|
||||
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
|
||||
}
|
||||
|
||||
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
|
||||
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
|
||||
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
|
||||
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
|
||||
{
|
||||
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
|
||||
}
|
||||
|
||||
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
|
||||
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
|
||||
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
|
||||
std::filesystem::path const& modelPath)
|
||||
{
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
|
||||
@ -2333,6 +2333,7 @@ size_t Serialization::serializedSize(KVCacheStoredBlockData const& data)
|
||||
totalSize += su::serializedSize(data.loraId);
|
||||
totalSize += su::serializedSize(data.cacheLevel);
|
||||
totalSize += su::serializedSize(data.priority);
|
||||
totalSize += su::serializedSize(data.mmKeys);
|
||||
return totalSize;
|
||||
}
|
||||
|
||||
@ -2343,6 +2344,7 @@ void Serialization::serialize(KVCacheStoredBlockData const& data, std::ostream&
|
||||
su::serialize(data.loraId, os);
|
||||
su::serialize(data.cacheLevel, os);
|
||||
su::serialize(data.priority, os);
|
||||
su::serialize(data.mmKeys, os);
|
||||
}
|
||||
|
||||
KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::istream& is)
|
||||
@ -2352,8 +2354,9 @@ KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::ist
|
||||
auto loraId = su::deserialize<std::optional<tensorrt_llm::runtime::LoraTaskIdType>>(is);
|
||||
auto cacheLevel = su::deserialize<SizeType32>(is);
|
||||
auto priority = su::deserialize<SizeType32>(is);
|
||||
auto mmKeys = su::deserialize<std::vector<tensorrt_llm::batch_manager::kv_cache_manager::MmKey>>(is);
|
||||
|
||||
return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority};
|
||||
return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority, mmKeys};
|
||||
}
|
||||
|
||||
// KVcacheRemovedData
|
||||
|
||||
@ -40,9 +40,7 @@ list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*")
|
||||
|
||||
function(filter_cuda_archs ARCH SOURCES_VAR)
|
||||
if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
|
||||
set(FILTER_REGEX
|
||||
".*_sm(_)?${ARCH}[.]cubin[.]cpp|^.*Sm(_)?${ARCH}.*cubin.cpp$|.*_sm(_)?${ARCH}[.]cu|^.*Sm(_)?${ARCH}.*cu$"
|
||||
)
|
||||
set(FILTER_REGEX ".*[Ss][Mm]_?${ARCH}(af)?.*(cubin\.cpp|\.cu)$")
|
||||
list(APPEND SOURCES ${${SOURCES_VAR}})
|
||||
list(APPEND SOURCES_FILTERED ${SOURCES})
|
||||
list(FILTER SOURCES_FILTERED INCLUDE REGEX "${FILTER_REGEX}")
|
||||
|
||||
@ -16,9 +16,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
|
||||
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
|
||||
cudaStream_t stream = 0);
|
||||
|
||||
}
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -17,12 +17,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
|
||||
namespace tensorrt_llm::kernels
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
|
||||
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
|
||||
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
|
||||
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
|
||||
cudaStream_t const stream = 0);
|
||||
|
||||
} // namespace tensorrt_llm::kernels
|
||||
} // namespace kernels
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
#include "attentionMask.h"
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
|
||||
#include "tensorrt_llm/common/cudaFp8Utils.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
@ -24,8 +25,8 @@
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/gptKernels.h"
|
||||
#include "tensorrt_llm/runtime/iTensor.h"
|
||||
@ -25,8 +26,8 @@
|
||||
|
||||
namespace tc = tensorrt_llm::common;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -64,4 +65,5 @@ template <typename MaskDataType>
|
||||
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -14,14 +14,15 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/banBadWords.h"
|
||||
|
||||
using namespace tensorrt_llm::common;
|
||||
using namespace tensorrt_llm::runtime;
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
|
||||
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
@ -16,12 +16,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace tensorrt_llm
|
||||
{
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace kernels
|
||||
{
|
||||
|
||||
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
|
||||
cudaStream_t stream);
|
||||
|
||||
} // namespace kernels
|
||||
} // namespace tensorrt_llm
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user