From 98424f3186a107fe60e71b9dedf01b4cf8f2a4a9 Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Wed, 6 Aug 2025 18:19:03 +0800 Subject: [PATCH 001/186] [TRTLLM-5633][infra] Change the TOT repo to default-llm-repo for merge waive list (#6605) Signed-off-by: Yiqing Yan Co-authored-by: Yanchao Lu --- jenkins/L0_MergeRequest.groovy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index cefb06508c..48e68efea3 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -339,7 +339,9 @@ def mergeWaiveList(pipeline, globalVars) LLM_TOT_ROOT = "llm-tot" targetBranch = env.gitlabTargetBranch ? env.gitlabTargetBranch : globalVars[TARGET_BRANCH] echo "Target branch: ${targetBranch}" - trtllm_utils.checkoutSource(LLM_REPO, targetBranch, LLM_TOT_ROOT, true, true) + withCredentials([string(credentialsId: 'default-sync-llm-repo', variable: 'DEFAULT_SYNC_LLM_REPO')]) { + trtllm_utils.checkoutSource(DEFAULT_SYNC_LLM_REPO, targetBranch, LLM_TOT_ROOT, false, false) + } targetBranchTOTCommit = sh (script: "cd ${LLM_TOT_ROOT} && git rev-parse HEAD", returnStdout: true).trim() echo "Target branch TOT commit: ${targetBranchTOTCommit}" sh "cp ${LLM_TOT_ROOT}/tests/integration/test_lists/waives.txt ./waives_TOT_${targetBranchTOTCommit}.txt" From b7347ce7d1537a1d449dc712edc9602786f57f9d Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Wed, 6 Aug 2025 18:50:53 +0800 Subject: [PATCH 002/186] [https://nvbugs/5433581][fix] Revert deep_gemm installation workaround for SBSA (#6666) Signed-off-by: Yanchao Lu --- docs/source/installation/linux.md | 5 ----- jenkins/L0_Test.groovy | 12 ------------ 2 files changed, 17 deletions(-) diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md index ab471e8c1d..9262453b66 100644 --- a/docs/source/installation/linux.md +++ b/docs/source/installation/linux.md @@ -16,11 +16,6 @@ # Optional step: Only required for NVIDIA Blackwell GPUs and SBSA platform pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 - # Optional step: Workaround for deep_gemm installation failure on SBSA platform - # The actual deep_gemm package and version should be obtained from the requirements.txt file. 
- pip3 install 'deep_gemm @ git+https://github.com/zongfeijing/DeepGEMM.git@a9d538ef4dff0326fe521c6ca0bfde115703b56a' \ - --extra-index-url https://download.pytorch.org/whl/cu128 - sudo apt-get -y install libopenmpi-dev ``` diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 9bf77571ba..b40c7a11a7 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2106,18 +2106,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128") } - // Workaround for https://nvbugs/5433581 where deep_gemm installation fails on SBSA platform - if (cpu_arch == AARCH64_TRIPLE) { - echo "###### Workaround for https://nvbugs/5433581 Start ######" - def deepGemmLine = readFile("${LLM_ROOT}/requirements.txt").readLines().find { it.trim().startsWith('deep_gemm') } - if (deepGemmLine) { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install '${deepGemmLine.trim()}' --extra-index-url https://download.pytorch.org/whl/cu128") - } - else { - echo "deep_gemm package not found in requirements.txt" - } - } - def libEnv = [] if (env.alternativeTRT) { stage("Replace TensorRT") { From 79fc2f48c0443db5678d8b075bb2f2899c225e37 Mon Sep 17 00:00:00 2001 From: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:30:35 +0800 Subject: [PATCH 003/186] [None][chore] Enhance trtllm-serve example test (#6604) Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> --- .../openai_completion_client_json_schema.py | 14 +++++- .../llmapi/apps/_test_trtllm_serve_example.py | 44 ++++++++++++++++--- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/examples/serve/openai_completion_client_json_schema.py b/examples/serve/openai_completion_client_json_schema.py index 2f110270f5..56e5a351a0 100644 --- a/examples/serve/openai_completion_client_json_schema.py +++ b/examples/serve/openai_completion_client_json_schema.py @@ -1,5 +1,9 @@ ### :title OpenAI Completion Client with JSON Schema +# This example requires to specify `guided_decoding_backend` as +# `xgrammar` or `llguidance` in the extra_llm_api_options.yaml file. 
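+# For reference, a minimal extra_llm_api_options.yaml for this example could
+# contain just the following (xgrammar shown; llguidance works the same way):
+#
+#   guided_decoding_backend: xgrammar
+#
+# and is passed to the server when it is launched, e.g.
+#   trtllm-serve <model> --extra_llm_api_options extra_llm_api_options.yaml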
+import json + from openai import OpenAI client = OpenAI( @@ -18,7 +22,6 @@ response = client.chat.completions.create( "content": f"Give me the information of the biggest city of China in the JSON format.", }], - max_tokens=100, temperature=0, response_format={ "type": "json", @@ -39,4 +42,11 @@ response = client.chat.completions.create( } }, ) -print(response.choices[0].message.content) + +content = response.choices[0].message.content +try: + response_json = json.loads(content) + assert "name" in response_json and "population" in response_json + print(content) +except json.JSONDecodeError: + print("Failed to decode JSON response") diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py index 262eafa820..6921c024d5 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py @@ -1,8 +1,11 @@ +import json import os import subprocess import sys +import tempfile import pytest +import yaml from .openai_server import RemoteOpenAIServer @@ -16,10 +19,26 @@ def model_name(): @pytest.fixture(scope="module") -def server(model_name: str): +def temp_extra_llm_api_options_file(): + temp_dir = tempfile.gettempdir() + temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml") + try: + extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"} + with open(temp_file_path, 'w') as f: + yaml.dump(extra_llm_api_options_dict, f) + + yield temp_file_path + finally: + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + + +@pytest.fixture(scope="module") +def server(model_name: str, temp_extra_llm_api_options_file: str): model_path = get_model_path(model_name) # fix port to facilitate concise trtllm-serve examples - with RemoteOpenAIServer(model_path, port=8000) as remote_server: + args = ["--extra_llm_api_options", temp_extra_llm_api_options_file] + with RemoteOpenAIServer(model_path, args, port=8000) as remote_server: yield remote_server @@ -40,8 +59,19 @@ def test_trtllm_serve_examples(exe: str, script: str, server: RemoteOpenAIServer, example_root: str): client_script = os.path.join(example_root, script) # CalledProcessError will be raised if any errors occur - subprocess.run([exe, client_script], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - check=True) + result = subprocess.run([exe, client_script], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True) + if script.startswith("curl"): + # For curl scripts, we expect a JSON response + result_stdout = result.stdout.strip() + try: + data = json.loads(result_stdout) + assert "code" not in data or data[ + "code"] == 200, f"Unexpected response: {data}" + except json.JSONDecodeError as e: + pytest.fail( + f"Failed to parse JSON response from {script}: {e}\nStdout: {result_stdout}\nStderr: {result.stderr}" + ) From 13ecb4aced53f4cb07babe70b6b558cf9626f7fc Mon Sep 17 00:00:00 2001 From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com> Date: Wed, 6 Aug 2025 06:08:29 -0700 Subject: [PATCH 004/186] [https://nvbugs/5328160][fix] Unwaive disaggregated serving tests (#6644) Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index ad2c7efd8f..f5f55f8f3c 100644 --- a/tests/integration/test_lists/waives.txt +++ 
b/tests/integration/test_lists/waives.txt @@ -169,7 +169,6 @@ examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padd examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058) -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5247271) full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) @@ -210,7 +209,6 @@ perf/test_perf.py::test_perf[bart_large_cnn-bench-float16-input_output_len:128,2 perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] SKIP (https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5328160) stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5328495) full:B200/examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] SKIP (https://nvbugs/5292737) full:B200/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5295470) From 1aed7511fe77c70b156fa37882f091c00f13ddaa Mon Sep 17 00:00:00 2001 From: Yechan Kim <161688079+yechank-nvidia@users.noreply.github.com> Date: Wed, 6 Aug 2025 22:58:58 +0900 Subject: [PATCH 005/186] [https://nvbugs/5430124][fix] Mistral mixture_text_image test case fix (#6648) Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> --- tests/integration/defs/test_e2e.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index c9d13f31fc..a7d45fb37f 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2165,8 +2165,8 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, "Describe the scene in the image briefly.", ], "media": [ - [], - [str(test_data_root / "inpaint.png")], + "", + str(test_data_root / "inpaint.png"), ], } } @@ -2226,7 +2226,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, ], 
"mixture_text_image": [["invention", "person", "scientists", "Lick", "engineers"], - ["landscape", "dome", "yosemite", "altitude", "scattered"]] + ["landscape", "trees", "road", "natural", "rock"]] }, "gemma-3-27b-it": { "image": [ From 5eae3184fa8bbbf1c2d8b8c9db07f5322a9ec620 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Wed, 6 Aug 2025 22:12:27 +0800 Subject: [PATCH 006/186] [None][chore] add missing tests to test list (#6590) Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- tests/integration/test_lists/test-db/l0_a10.yml | 5 +++++ tests/unittest/llmapi/test_utils.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 6caee5b69f..891649e5b9 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -99,6 +99,11 @@ l0_a10: - unittest/test_model_runner_cpp.py - unittest/llmapi/test_build_cache.py - unittest/llmapi/test_llm_utils.py + - unittest/llmapi/test_gc_utils.py + - unittest/llmapi/test_reasoning_parser.py + - unittest/llmapi/test_serialization.py + - unittest/llmapi/test_utils.py + - unittest/llmapi/test_llm_args.py - accuracy/test_cli_flow.py::TestGpt2::test_auto_dtype # 1 min - accuracy/test_cli_flow.py::TestGpt2::test_beam_search # 1 min - accuracy/test_cli_flow.py::TestGpt2::test_beam_search_large # 6 mins diff --git a/tests/unittest/llmapi/test_utils.py b/tests/unittest/llmapi/test_utils.py index d742283ca5..fc5876cdb1 100644 --- a/tests/unittest/llmapi/test_utils.py +++ b/tests/unittest/llmapi/test_utils.py @@ -13,7 +13,9 @@ def test_api_status_registry(): def _my_method(self, *args, **kwargs): pass - assert ApiStatusRegistry.get_api_status(_my_method) == "prototype" + # will always keep the first status, and the behaviour will be unknown if + # one method is registered with a different status in different files. + assert ApiStatusRegistry.get_api_status(_my_method) == "beta" class App: From 3a71ddfe0926fd4f5f3ad93eb6195e14eb84ad21 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Wed, 6 Aug 2025 22:13:54 +0800 Subject: [PATCH 007/186] [TRTLLM-6859][doc] Add DeepSeek R1 deployment guide. 
(#6579) Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- ...start-recipe-for-deepseek-r1-on-trt-llm.md | 386 ++++++++++++++++++ 2 files changed, 387 insertions(+), 1 deletion(-) create mode 100644 examples/models/core/deepseek_v3/quick-start-recipe-for-deepseek-r1-on-trt-llm.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 00516b1afa..74f830f07d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: args: [--allow-multiple-documents] exclude: ".*/gitlab/.*.yml" - id: trailing-whitespace - exclude: '\.patch$' + exclude: '\.(patch|md)$' - id: check-toml - id: mixed-line-ending args: [--fix=lf] diff --git a/examples/models/core/deepseek_v3/quick-start-recipe-for-deepseek-r1-on-trt-llm.md b/examples/models/core/deepseek_v3/quick-start-recipe-for-deepseek-r1-on-trt-llm.md new file mode 100644 index 0000000000..6c019c8a86 --- /dev/null +++ b/examples/models/core/deepseek_v3/quick-start-recipe-for-deepseek-r1-on-trt-llm.md @@ -0,0 +1,386 @@ +# Quick Start Recipe for DeepSeek R1 on TensorRT-LLM - Blackwell & Hopper Hardware + +## Introduction + +This deployment guide provides step-by-step instructions for running the DeepSeek R1 model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output. + +The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. + +## Prerequisites + +GPU: NVIDIA Blackwell or Hopper Architecture +OS: Linux +Drivers: CUDA Driver 575 or Later +Docker with NVIDIA Container Toolkit installed +Python3 and python3-pip (Optional, for accuracy evaluation only) + +## Models + +* FP8 model: [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) +* NVFP4 model: [DeepSeek-R1-0528-FP4](https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4) + + +Note that NVFP4 is only supported on NVIDIA Blackwell platform. + +## Deployment Steps + +### Run Docker Container + +Run the docker container using the TensorRT-LLM NVIDIA NGC image. + +```shell +docker run --rm -it \ +--ipc=host \ +--gpus all \ +-p 8000:8000 \ +-v ~/.cache:/root/.cache:rw \ +--name tensorrt_llm \ +nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc5 \ +/bin/bash +``` + +Note: + +* You can mount additional directories and paths using the \-v \:\ flag if needed, such as mounting the downloaded weight paths. +* The command mounts your user .cache directory to save the downloaded model checkpoints which are saved to \~/.cache/huggingface/hub/ by default. This prevents having to redownload the weights each time you rerun the container. If the \~/.cache directory doesn’t exist please create it using mkdir \~/.cache +* The command also maps port **8000** from the container to your host so you can access the LLM API endpoint from your host +* See the [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. 
The containers published in the main branch weekly have “rcN” suffix, while the monthly release with QA tests has no “rcN” suffix. Use the rc release to get the latest model and feature support. + +If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) + +### Creating the TRT-LLM Server config + +We create a YAML configuration file /tmp/config.yml for the TensorRT-LLM Server and populate it with the following recommended performance settings. + +```shell +EXTRA_LLM_API_FILE=/tmp/config.yml + +cat << EOF > ${EXTRA_LLM_API_FILE} +enable_attention_dp: true +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +kv_cache_config: + dtype: fp8 +stream_interval: 10 +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 +EOF +``` + +For FP8 model, we need extra `moe_config`: + +```shell +EXTRA_LLM_API_FILE=/tmp/config.yml + +cat << EOF > ${EXTRA_LLM_API_FILE} +enable_attention_dp: true +cuda_graph_config: + enable_padding: true + max_batch_size: 128 +kv_cache_config: + dtype: fp8 +stream_interval: 10 +speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 +moe_config: + backend: DEEPGEMM + max_num_tokens: 3200 +EOF +``` + +### Launch the TRT-LLM Server + +Below is an example command to launch the TRT-LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section. + +```shell +trtllm-serve deepseek-ai/DeepSeek-R1-0528 \ + --host 0.0.0.0 \ + --port 8000 \ + --backend pytorch \ + --max_batch_size 1024 \ + --max_num_tokens 3200 \ + --max_seq_len 2048 \ + --kv_cache_free_gpu_memory_fraction 0.8 \ + --tp_size 8 \ + --ep_size 8 \ + --trust_remote_code \ + --extra_llm_api_options ${EXTRA_LLM_API_FILE} +``` + +After the server is set up, the client can now send prompt requests to the server and receive results. + +### Configs and Parameters + +These options are used directly on the command line when you start the `trtllm-serve` process. +#### `--tp_size` + + **Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance. + +#### `--ep_size` + + **Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tp_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models. + +#### `--kv_cache_free_gpu_memory_fraction` + + **Description:** A value between 0.0 and 1.0 that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors. + + **Recommendation:** If you experience OOM errors, try reducing this value to **0.7** or lower. + +#### `--backend pytorch` + + **Description:** Tells TensorRT-LLM to use the **pytorch** backend. + +#### `--max_batch_size` + + **Description:** The maximum number of user requests that can be grouped into a single batch for processing. + +#### `--max_num_tokens` + + **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch. 
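
As a quick illustration of how these limits relate in the launch command above, here is a rough sizing sketch using this guide's 1024/1024 benchmark shape (the numbers come from the `trtllm-serve` command shown earlier; this is a sanity check rather than a tuning rule):

```shell
# Values from the trtllm-serve command in this guide:
#   --max_batch_size 1024  -> up to 1024 requests may be grouped into one batch
#   --max_num_tokens 3200  -> up to 3200 tokens may be scheduled per batch step
#   --max_seq_len   2048   -> one request may span its input plus output tokens
ISL=1024; OSL=1024
echo "tokens per request: $((ISL + OSL))"   # 2048, matching --max_seq_len above
```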
+ +#### `--max_seq_len` + + **Description:** The maximum possible sequence length for a single request, including both input and generated output tokens. + +#### `--trust_remote_code` + + **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. + + +#### Extra LLM API Options (YAML Configuration) + +These options provide finer control over performance and are set within a YAML file passed to the trtllm-serve command via the \--extra\_llm\_api\_options argument. + +#### `kv_cache_config` + + **Description**: A section for configuring the Key-Value (KV) cache. + + **Options**: + +  dtype: Sets the data type for the KV cache. + +  **Default**: auto (uses the data type specified in the model checkpoint). + +#### `cuda_graph_config` + + **Description**: A section for configuring CUDA graphs to optimize performance. + + **Options**: + +  enable\_padding: If true, input batches are padded to the nearest cuda\_graph\_batch\_size. This can significantly improve performance. + +  **Default**: false + +  max\_batch\_size: Sets the maximum batch size for which a CUDA graph will be created. + +  **Default**: 0 + +  **Recommendation**: Set this to the same value as the \--max\_batch\_size command-line option. + +  batch\_sizes: A specific list of batch sizes to create CUDA graphs for. + +  **Default**: None + +#### `moe_config` + + **Description**: Configuration for Mixture-of-Experts (MoE) models. + + **Options**: + +  backend: The backend to use for MoE operations. + +  **Default**: CUTLASS + +#### `attention_backend` + + **Description**: The backend to use for attention calculations. + + **Default**: TRTLLM + +See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the extra\_llm\_api\_options`.` + +## Testing API Endpoint + +### Basic Test + +Start a new terminal on the host to test the TensorRT-LLM server you just launched. + +You can query the health/readiness of the server using: + +```shell +curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health" +``` + +When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation. + +After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server. + +```shell +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "deepseek-ai/DeepSeek-R1-0528", + "prompt": "Where is New York?", + "max_tokens": 16, + "temperature": 0 +}' +``` + +Here is an example response, showing that the TRT-LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence. 
+ +```json +{"id":"cmpl-e728f08114c042309efeae4df86a50ca","object":"text_completion","created":1754294810,"model":"deepseek-ai/DeepSeek-R1-0528","choices":[{"index":0,"text":" / by Megan Stine ; illustrated by John Hinderliter.\n\nBook | Gross","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null} +``` + +### Troubleshooting Tips + +* If you encounter CUDA out-of-memory errors, try reducing max\_batch\_size or max\_seq\_len +* Ensure your model checkpoints are compatible with the expected format +* For performance issues, check GPU utilization with nvidia-smi while the server is running +* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed +* For connection issues, make sure port 8000 is not being used by another application + +### Running Evaluations to Verify Accuracy (Optional) + +We use the lm-eval tool to test the model’s accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). + +To run the evaluation harness exec into the running TensorRT-LLM container and install with this command: + +```shell +docker exec -it tensorrt_llm /bin/bash + +pip install lm_eval +``` + +FP8 command for GSM8K: + +* Note: The tokenizer will add BOS (beginning of sentence token) before input prompt by default which leads to accuracy regression on GSM8K task for DeepSeek R1 model. So, set add\_special\_tokens=False to avoid it. + +``` +MODEL_PATH=deepseek-ai/DeepSeek-R1-0528 + +lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs temperature=0.0,add_special_tokens=False --num_fewshot 5 --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/completions,num_concurrent=32,max_retries=20,tokenized_requests=False --log_samples --output_path trtllm.fp8.gsm8k +``` + +Sample result in Blackwell: + +```shell +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9538|± |0.0058| +| | |strict-match | 5|exact_match|↑ |0.9500|± |0.0060| +``` + +FP4 command for GSM8K: + +* Note: The tokenizer will add BOS before input prompt by default, which leads to accuracy regression on GSM8K task for DeepSeek R1 model. So set add\_special\_tokens=False to avoid it. + +```shell +MODEL_PATH=nvidia/DeepSeek-R1-0528-FP4 + +lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs temperature=0.0,add_special_tokens=False --num_fewshot 5 --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/completions,num_concurrent=32,max_retries=20,tokenized_requests=False --log_samples --output_path trtllm.fp4.gsm8k +``` + +Sample result in Blackwell: + +```shell +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9462|± |0.0062| +| | |strict-match | 5|exact_match|↑ |0.9447|± |0.0063| +``` + +## Benchmarking Performance + +To benchmark the performance of your TensorRT-LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. 
+ +```shell +cat < bench.sh +concurrency_list="32 64 128 256 512 1024 2048 4096" +multi_round=5 +isl=1024 +osl=1024 +result_dir=/tmp/deepseek_r1_output + +for concurrency in ${concurrency_list}; do + num_prompts=$((concurrency * multi_round)) + python -m tensorrt_llm.serve.scripts.benchmark_serving \ + --model deepseek-ai/DeepSeek-R1-0528 \ + --backend openai \ + --dataset-name "random" \ + --random-input-len ${isl} \ + --random-output-len ${osl} \ + --random-prefix-len 0 \ + --random-ids \ + --num-prompts ${num_prompts} \ + --max-concurrency ${concurrency} \ + --ignore-eos \ + --tokenize-on-client \ + --percentile-metrics "ttft,tpot,itl,e2el" +done +EOF +chmod +x bench.sh +``` + +To benchmark the FP4 model, replace \--model deepseek-ai/DeepSeek-R1-0528 with \--model nvidia/DeepSeek-R1-0528-FP4. + +If you want to save the results to a file add the following options. + +```shell +--save-result \ +--result-dir "${result_dir}" \ +--result-filename "concurrency_${concurrency}.json" +``` + +For more benchmarking options see. [https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt\_llm/serve/scripts/benchmark\_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) + +Run bench.sh to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above bench.sh script. + +```shell +./bench.sh +``` + +Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations. + +``` +============ Serving Benchmark Result ============ +Successful requests: 16 +Benchmark duration (s): 17.66 +Total input tokens: 16384 +Total generated tokens: 16384 +Request throughput (req/s): [result] +Output token throughput (tok/s): [result] +Total Token throughput (tok/s): [result] +User throughput (tok/s): [result] +---------------Time to First Token---------------- +Mean TTFT (ms): [result] +Median TTFT (ms): [result] +P99 TTFT (ms): [result] +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): [result] +Median TPOT (ms): [result] +P99 TPOT (ms): [result] +---------------Inter-token Latency---------------- +Mean ITL (ms): [result] +Median ITL (ms): [result] +P99 ITL (ms): [result] +----------------End-to-end Latency---------------- +Mean E2EL (ms): [result] +Median E2EL (ms): [result] +P99 E2EL (ms): [result] +================================================== +``` + +### Key Metrics + +* Median Time to First Token (TTFT) + * The typical time elapsed from when a request is sent until the first output token is generated. +* Median Time Per Output Token (TPOT) + * The typical time required to generate each token *after* the first one. +* Median Inter-Token Latency (ITL) + * The typical time delay between the completion of one token and the completion of the next. +* Median End-to-End Latency (E2EL) + * The typical total time from when a request is submitted until the final token of the response is received. +* Total Token Throughput + * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens. 
From a16ba6445c61ed70e7aadfe787d6f316bb422652 Mon Sep 17 00:00:00 2001 From: chenfeiz0326 Date: Wed, 6 Aug 2025 22:15:24 +0800 Subject: [PATCH 008/186] [None][doc] Create deployment guide for Llama4 Scout FP8 and NVFP4 (#6550) Signed-off-by: Chenfei Zhang Co-authored-by: Tao Li @ NVIDIA --- ...Start Recipe for TRT-LLM + Llama4 Scout.md | 359 ++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100644 examples/models/core/llama4/Quick Start Recipe for TRT-LLM + Llama4 Scout.md diff --git a/examples/models/core/llama4/Quick Start Recipe for TRT-LLM + Llama4 Scout.md b/examples/models/core/llama4/Quick Start Recipe for TRT-LLM + Llama4 Scout.md new file mode 100644 index 0000000000..e66ffad145 --- /dev/null +++ b/examples/models/core/llama4/Quick Start Recipe for TRT-LLM + Llama4 Scout.md @@ -0,0 +1,359 @@ +# Quick Start Recipe for Llama4 Scout 17B FP8 and NVFP4 + +## Introduction + +This deployment guide provides step-by-step instructions for running the Llama-4-Scout-17B-16E-Instruct model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output. + +The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. + +## Access & Licensing + +To use Llama4 Scout 17B, you must first agree to Meta’s Llama 4 Community License ([https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE)). NVIDIA’s quantized versions (FP8 and NVFP4) are built on top of the base model and are available for research and commercial use under the same license. + +## Prerequisites + +GPU: NVIDIA Blackwell or Hopper Architecture +OS: Linux +Drivers: CUDA Driver 575 or Later +Docker with NVIDIA Container Toolkit installed +Python3 and python3-pip (Optional, for accuracy evaluation only) + +## Models + +* FP8 model: [Llama-4-Scout-17B-16E-Instruct-FP8](https://huggingface.co/nvidia/Llama-4-Scout-17B-16E-Instruct-FP8) +* NVFP4 model: [Llama-4-Scout-17B-16E-Instruct-FP4](https://huggingface.co/nvidia/Llama-4-Scout-17B-16E-Instruct-FP4) + +Note that NVFP4 is only supported on NVIDIA Blackwell platform. + +## Deployment Steps + +### Run Docker Container + +Run the docker container using the TensorRT-LLM NVIDIA NGC image. + +```shell +docker run --rm -it \ +--ipc=host \ +--gpus all \ +-p 8000:8000 \ +-v ~/.cache:/root/.cache:rw \ +--name tensorrt_llm \ +nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc4 \ +/bin/bash +``` + +Note: + +* You can mount additional directories and paths using the `-v :` flag if needed, such as mounting the downloaded weight paths. +* The command mounts your user .cache directory to save the downloaded model checkpoints which are saved to `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container. If the `~/.cache` directory doesn’t exist please create it using mkdir `~/.cache`. 
+* The command also maps port 8000 from the container to your host so you can access the LLM API endpoint from your host +* See the [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. The containers published in the main branch weekly have “rcN” suffix, while the monthly release with QA tests has no “rcN” suffix. Use the rc release to get the latest model and feature support. + +If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) + +### Creating the TRT-LLM Server config + +We create a YAML configuration file /tmp/config.yml for the TensorRT-LLM Server and populate it with the following recommended performance settings. + +```shell +EXTRA_LLM_API_FILE=/tmp/config.yml + +cat << EOF > ${EXTRA_LLM_API_FILE} +enable_attention_dp: false +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 +EOF +``` + +### Launch the TRT-LLM Server + +Below is an example command to launch the TRT-LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section. + +```shell +trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 \ + --host 0.0.0.0 \ + --port 8000 \ + --backend pytorch \ + --max_batch_size 1024 \ + --max_num_tokens 2048 \ + --max_seq_len 2048 \ + --kv_cache_free_gpu_memory_fraction 0.9 \ + --tp_size 1 \ + --ep_size 1 \ + --trust_remote_code \ + --extra_llm_api_options ${EXTRA_LLM_API_FILE} +``` + +After the server is set up, the client can now send prompt requests to the server and receive results. + +### Configs and Parameters + +These options are used directly on the command line when you start the `trtllm-serve` process. +#### `--tp_size` + + **Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance. + +#### `--ep_size` + + **Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tp_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models. + +#### `--kv_cache_free_gpu_memory_fraction` + + **Description:** A value between 0.0 and 1.0 that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors. + + **Recommendation:** If you experience OOM errors, try reducing this value to **0.8** or lower. + +#### `--backend pytorch` + + **Description:** Tells TensorRT-LLM to use the **pytorch** backend. + +#### `--max_batch_size` + + **Description:** The maximum number of user requests that can be grouped into a single batch for processing. + +#### `--max_num_tokens` + + **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch. + +#### `--max_seq_len` + + **Description:** The maximum possible sequence length for a single request, including both input and generated output tokens. 
+ +#### `--trust_remote_code` + + **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. + + +#### Extra LLM API Options (YAML Configuration) + +These options provide finer control over performance and are set within a YAML file passed to the trtllm-serve command via the `--extra_llm_api_options` argument. + +#### `kv_cache_config` + + **Description**: A section for configuring the Key-Value (KV) cache. + + **Options**: + +  `dtype`: Sets the data type for the KV cache. + +  **Default**: auto (uses the data type specified in the model checkpoint). + +#### `cuda_graph_config` + + **Description**: A section for configuring CUDA graphs to optimize performance. + + **Options**: + +  `enable_padding`: If true, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance. + +  **Default**: false + +  `max_batch_size`: Sets the maximum batch size for which a CUDA graph will be created. + +  **Default**: 0 + +  **Recommendation**: Set this to the same value as the `--max_batch_size` command-line option. + +  `batch_sizes`: A specific list of batch sizes to create CUDA graphs for. + +  **Default**: None + +#### `moe_config` + + **Description**: Configuration for Mixture-of-Experts (MoE) models. + + **Options**: + +  `backend`: The backend to use for MoE operations. + +  **Default**: CUTLASS + +#### `attention_backend` + + **Description**: The backend to use for attention calculations. + + **Default**: TRTLLM + +See the [TorchLlmArgs](https://github.com/nvidia/TensorRT-LLM/blob/main/tensorrt_llm/llmapi/llm_args.py#L1980) class for the full list of options which can be used in the `extra_llm_api_options`. + +## Testing API Endpoint + +### Basic Test + +Start a new terminal on the host to test the TensorRT-LLM server you just launched. + +You can query the health/readiness of the server using: + +```shell +curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health" +``` + +When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation. + +After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server. + +```shell +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", + "prompt": "Where is New York?", + "max_tokens": 16, + "temperature": 0 +}' +``` + +Here is an example response, showing that the TRT-LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence. + +```json +{"id":"cmpl-bc1393d529ce485c961d9ffee5b25d72","object":"text_completion","created":1753843963,"model":"$MODEL","choices":[{"index":0,"text":" New York is a state located in the northeastern United States. It is bordered by","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null} +``` + +### Troubleshooting Tips + +* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`. +* Ensure your model checkpoints are compatible with the expected format. +* For performance issues, check GPU utilization with nvidia-smi while the server is running. 
+* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed. +* For connection issues, make sure port 8000 is not being used by another application. + +### Running Evaluations to Verify Accuracy (Optional) + +We use the lm-eval tool to test the model’s accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). + +To run the evaluation harness exec into the running TensorRT-LLM container and install with this command: + +```shell +docker exec -it tensorrt_llm /bin/bash + +pip install lm_eval +``` + +FP8 command for GSM8K + +```shell +MODEL_PATH=nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 + +lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs temperature=0.0 --num_fewshot 5 --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/completions,num_concurrent=32,max_retries=20,tokenized_requests=False --log_samples --output_path trtllm.fp8.gsm8k +``` + +Sample result in Blackwell. + +```shell +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9189|± |0.0075| +| | |strict-match | 5|exact_match|↑ |0.8984|± |0.0083| +``` + +FP4 command for GSM8K + +```shell +MODEL_PATH=nvidia/Llama-4-Scout-17B-16E-Instruct-FP4 + +lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs temperature=0.0 --num_fewshot 5 --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/completions,num_concurrent=32,max_retries=20,tokenized_requests=False --log_samples --output_path trtllm.fp4.gsm8k +``` + +Sample result in Blackwell + +```shell +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9075|± |0.0080| +| | |strict-match | 5|exact_match|↑ |0.8908|± |0.0086| +``` + +## Benchmarking Performance + +To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. + +```shell +cat < bench.sh +concurrency_list="1 2 4 8 16 32 64 128 256" +multi_round=5 +isl=1024 +osl=1024 +result_dir=/tmp/llama4_output + +for concurrency in ${concurrency_list}; do + num_prompts=$((concurrency * multi_round)) + python -m tensorrt_llm.serve.scripts.benchmark_serving \ + --model nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 \ + --backend openai \ + --dataset-name "random" \ + --random-input-len ${isl} \ + --random-output-len ${osl} \ + --random-prefix-len 0 \ + --random-ids \ + --num-prompts ${num_prompts} \ + --max-concurrency ${concurrency} \ + --ignore-eos \ + --tokenize-on-client \ + --percentile-metrics "ttft,tpot,itl,e2el" +done +EOF +chmod +x bench.sh +``` + +To benchmark the FP4 model, replace `--model nvidia/Llama-4-Scout-17B-16E-Instruct-FP8` with `--model nvidia/Llama-4-Scout-17B-16E-Instruct-FP4`. + +If you want to save the results to a file add the following options. + +```shell +--save-result \ +--result-dir "${result_dir}" \ +--result-filename "concurrency_${concurrency}.json" +``` + +For more benchmarking options see. [https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt\_llm/serve/scripts/benchmark\_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) + +Run bench.sh to begin a serving benchmark. 
This will take a long time if you run all the concurrencies mentioned in the above bench.sh script. + +```shell +./bench.sh +``` + +Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations. + +``` +============ Serving Benchmark Result ============ +Successful requests: 16 +Benchmark duration (s): 17.66 +Total input tokens: 16384 +Total generated tokens: 16384 +Request throughput (req/s): [result] +Output token throughput (tok/s): [result] +Total Token throughput (tok/s): [result] +User throughput (tok/s): [result] +---------------Time to First Token---------------- +Mean TTFT (ms): [result] +Median TTFT (ms): [result] +P99 TTFT (ms): [result] +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): [result] +Median TPOT (ms): [result] +P99 TPOT (ms): [result] +---------------Inter-token Latency---------------- +Mean ITL (ms): [result] +Median ITL (ms): [result] +P99 ITL (ms): [result] +----------------End-to-end Latency---------------- +Mean E2EL (ms): [result] +Median E2EL (ms): [result] +P99 E2EL (ms): [result] +================================================== +``` + +### Key Metrics + +* Median Time to First Token (TTFT) + * The typical time elapsed from when a request is sent until the first output token is generated. +* Median Time Per Output Token (TPOT) + * The typical time required to generate each token *after* the first one. +* Median Inter-Token Latency (ITL) + * The typical time delay between the completion of one token and the completion of the next. +* Median End-to-End Latency (E2EL) + * The typical total time from when a request is submitted until the final token of the response is received. +* Total Token Throughput + * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens. 
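
The per-request latency metrics above are related to one another. A back-of-the-envelope sketch that follows from the definitions (the numbers below are purely illustrative, not measured results):

```shell
# E2EL is roughly the time to the first token plus the per-token time for the rest:
#   e2el ≈ ttft + (output_tokens - 1) * tpot
ttft_ms=150; tpot_ms=12; output_tokens=1024   # illustrative values only
echo "approx E2EL (ms): $((ttft_ms + (output_tokens - 1) * tpot_ms))"
```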
From 7e0158b583344eae9e1760f7fdb24a83b8ea5a92 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Wed, 6 Aug 2025 14:05:18 -0700 Subject: [PATCH 009/186] Qwen3: Fix eagle hidden states (#6199) Signed-off-by: Izzy Putterman --- tensorrt_llm/_torch/models/modeling_qwen3_moe.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index 2d447dd527..eeefecb422 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -214,7 +214,9 @@ class Qwen3MoEDecoderLayer(DecoderLayer): if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - + if spec_metadata is not None and spec_metadata.is_layer_capture( + self.layer_idx): + self.fusion_config.POST_MOE_FUSION = False # Self Attention hidden_states = self.self_attn( position_ids=position_ids, @@ -257,9 +259,6 @@ class Qwen3MoEDecoderLayer(DecoderLayer): if self.fusion_config.POST_MOE_FUSION: if do_finalize: - if spec_metadata: - spec_metadata.maybe_capture_hidden_states( - self.layer_idx, hidden_states, residual) hidden_states, residual = self.allreduce( hidden_states, all_reduce_params=AllReduceParams( @@ -289,12 +288,8 @@ class Qwen3MoEDecoderLayer(DecoderLayer): hidden_states, residual = self.moe_allreduce( fc2_output, all_reduce_params=moe_all_reduce_params) - if spec_metadata: - spec_metadata.maybe_capture_hidden_states( - self.layer_idx, hidden_states, residual) - else: - if spec_metadata: + if spec_metadata and spec_metadata.is_layer_capture(self.layer_idx): spec_metadata.maybe_capture_hidden_states( self.layer_idx, hidden_states, residual) if self.next_layer_layernorm is not None: From 2a946859a79b53cf1e58d5f852ecf41597ad5d5e Mon Sep 17 00:00:00 2001 From: Yibin Li <109242046+yibinl-nvidia@users.noreply.github.com> Date: Wed, 6 Aug 2025 14:21:03 -0700 Subject: [PATCH 010/186] [None][fix] Upgrade dependencies version to avoid security vulnerability (#6506) Signed-off-by: Yibin Li <109242046+yibinl-nvidia@users.noreply.github.com> --- examples/models/core/mixtral/requirements.txt | 2 +- examples/models/core/qwen/requirements.txt | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/models/core/mixtral/requirements.txt b/examples/models/core/mixtral/requirements.txt index 50164ee5d3..fee4da9cf6 100644 --- a/examples/models/core/mixtral/requirements.txt +++ b/examples/models/core/mixtral/requirements.txt @@ -1,4 +1,4 @@ -c ../../../constraints.txt tensorrt_llm>=0.0.0.dev0 -transformers==4.38.2 +transformers==4.54.0 accelerate==0.25.0 diff --git a/examples/models/core/qwen/requirements.txt b/examples/models/core/qwen/requirements.txt index 397e53956d..64ada0fdb3 100644 --- a/examples/models/core/qwen/requirements.txt +++ b/examples/models/core/qwen/requirements.txt @@ -10,7 +10,7 @@ tiktoken einops # optional dependencies -gradio==4.36.0 +gradio==4.44.1 mdtex2html sse_starlette aiohttp_sse_client diff --git a/requirements.txt b/requirements.txt index 2ecd9ef5c4..c928e9836b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ diffusers>=0.27.0 lark mpi4py numpy<2 -onnx>=1.12.0 +onnx>=1.18.0 onnx_graphsurgeon>=0.5.2 openai polygraphy From f30398470de651e94ac93958267ecf020ef5e227 Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:00:45 +0800 Subject: [PATCH 011/186] [None][chore] update readme for perf 
release test (#6664) Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com> Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com> --- .../defs/perf/README_release_test.md | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tests/integration/defs/perf/README_release_test.md b/tests/integration/defs/perf/README_release_test.md index 7bff0ed37d..2fe42147c7 100644 --- a/tests/integration/defs/perf/README_release_test.md +++ b/tests/integration/defs/perf/README_release_test.md @@ -111,15 +111,40 @@ if self._config.backend == "pytorch": ### 3.1 Full Test Cycles -1. **trt_llm_release_perf_test.yml** - Release performance test -2. **trt_llm_perf_cluster_test.yml** - Cluster performance test +1. **llm_perf_full.yml** - Release performance test + - [test_lists/qa/llm_perf_full.yml](../../test_lists/qa/llm_perf_full.yml) +2. **llm_perf_cluster.yml** - Cluster performance test(for Blackwell) + - [test_lists/qa/llm_perf_cluster.yml](../../test_lists/qa/llm_perf_cluster.yml) +3. **llm_perf_nim.yml** - NIM performance test + - [test_lists/qa/llm_perf_nim.yml](../../test_lists/qa/llm_perf_nim.yml) ### 3.2 Sanity Test Cycles -- **trt_llm_release_perf_sanity.yml** - Release performance sanity test +- **llm_perf_sanity.yml** - Release performance sanity test + - [test_lists/qa/llm_perf_sanity.yml](../../test_lists/qa/llm_perf_sanity.yml) ## 4. Test Configuration Description +### 4.1 PyTorch Model Configuration + +The default PyTorch configuration is defined in [pytorch_model_config.py](pytorch_model_config.py) and can be overridden for specific test patterns. For example: + +```python +{ + 'patterns': [ + 'qwen3_235b_a22b_fp4-bench-pytorch-float4-maxbs:512-maxnt:2048-input_output_len:1000,2000-con:8-ep:8-gpus:8', + ], + 'config': { + 'enable_attention_dp': False, + 'moe_config': { + 'backend': 'TRTLLM' + } + } +} +``` + +This configuration allows you to customize PyTorch-specific settings for different model patterns while maintaining the base configuration as a fallback. + ### 4.1 Test Case Configuration - Test cases are defined in YAML configuration files - Support for different models, precisions, batch sizes, etc. 
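
Each entry in these YAML lists is a pytest node ID whose bracketed name packs the full workload description. The breakdown below is an informal reading of that naming convention, inferred from the names themselves rather than taken from test_perf.py:

```shell
# Example entry from llm_perf_full.yml, read left to right (informal interpretation):
#   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:500-con:250]
#     llama_v3.1_nemotron_nano_8b_fp8   model alias (FP8 pre-quantized checkpoint)
#     bench                             trtllm-bench flow
#     pytorch                           backend (entries without it appear to use the TRT engine flow)
#     float8                            activation dtype
#     maxbs:512                         max batch size
#     input_output_len:1000,1000        1000 input / 1000 output tokens per request
#     reqs:500                          number of requests issued
#     con:250                           client concurrency
# Other fields seen in these lists (maxnt, quant, tp/ep/gpus, loras) encode max tokens,
# quantization, parallelism and LoRA count in the same style.
```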
From 780d7507f9ce1f954b92cffe824bf543440898a8 Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:02:13 +0800 Subject: [PATCH 012/186] [None][test] remove trt backend cases in release perf test and move NIM cases to llm_perf_nim.yml (#6662) Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com> Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com> --- .../test_lists/qa/llm_perf_full.yml | 255 ++---------- .../test_lists/qa/llm_perf_nim.yml | 367 +++++++++++++++++- .../test_lists/qa/llm_perf_sanity.yml | 91 +---- 3 files changed, 400 insertions(+), 313 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_perf_full.yml b/tests/integration/test_lists/qa/llm_perf_full.yml index c4778586b5..6573c2dd9a 100644 --- a/tests/integration/test_lists/qa/llm_perf_full.yml +++ b/tests/integration/test_lists/qa/llm_perf_full.yml @@ -14,29 +14,16 @@ trt_llm_release_perf_test: - '*l20*' - '*h20*' tests: - # E2E BERT - - perf/test_perf.py::test_perf[bert_large-cpp-plugin-float16-bs:32+64-input_len:128+512] - - perf/test_perf.py::test_perf[roberta_base-cpp-plugin-float16-bs:32+64-input_len:128+512] - - # E2E gptManagerBenchmark IFB - # E2E ENC-DEC - - perf/test_perf.py::test_perf[t5_large-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,20] - # E2E trtllm-bench #llama_v3.1_8b_instruct - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128] - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32] - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-float16-input_output_len:128,128] + - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-pytorch-float16-input_output_len:128,128] - perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-float16-input_output_len:512,200] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:128,128] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128] # Ministral-8B - perf/test_perf.py::test_perf[ministral_8b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] @@ -47,26 +34,6 @@ trt_llm_release_perf_test: # Ministral-8B LoRA tests (using dummy Mistral LoRA checkpoint) - perf/test_perf.py::test_perf[ministral_8b-bench-pytorch-bfloat16-maxbs:2-maxnt:1024-input_output_len:128,128-loras:1-reqs:8-con:2] - # E2E ENC-DEC - - perf/test_perf.py::test_perf[bart_large_cnn-cppmanager-exe-plugin_ifb-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt-cppmanager-exe-plugin_ifb-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] - - 
perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] - - perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] - - perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] - - perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] - - # Phi-4-mini-instruct - # cpp - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-con:250] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-con:250] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-con:250] - # reduced 'reqs' to fit timeout limit - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:8-con:1] # Phi-4-multimodal-instruct - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] @@ -98,48 +65,7 @@ trt_llm_release_perf_test: tests: - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] - - perf/test_perf.py::test_perf[llama_v3_8b_instruct-cppmanager-exe-static_batching-plugin_ifb-float16-bs:8+64-input_output_len:128,128+512,32] - - perf/test_perf.py::test_perf[llama_v3_8b_instruct-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32] - - perf/test_perf.py::test_perf[llama_v3_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.0-input_output_len:128,128+512,32] - - perf/test_perf.py::test_perf[llama_v3_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.5-input_output_len:128,128+512,32] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-reqs:10-con:1] - # Llama-3.1-Nemotron-Nano-8B-v1 - # cpp backend - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:250] - # pyt backend - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] - - 
perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:500-con:250] - # FP8 prequantized pyt backend - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] - #long time llama_nemotron cases - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] # timeout for l20, l40s, a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1] #timeout for l20, l40s, failed for a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-reqs:8-con:1] # timeout for l20, l40s, failed on a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250] # failed for a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:250] # failed on A100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-con:250] # failed on A100 15 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-con:250] # timeout for l20, l40s, a100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-con:250] # timeout for l20, l40s, failed on A100 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] # failed for l20, need to extend context token to 5000 for l40s and a100, timeout for h20 - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250] #need to extend context token to 20000 for l40s, timeout for h20, a100 
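      # Naming note (illustrative reading, not authoritative): each entry here is an ID
      # consumed by perf/test_perf.py. For example,
      #   llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:8-con:1
      # encodes the model, the harness (bench = trtllm-bench, cppmanager = gptManagerBenchmark),
      # the backend (pytorch vs. the default TRT path), the dtype, and run knobs such as
      # maxbs/maxnt (max batch size / max new-context tokens), input_output_len (ISL,OSL),
      # reqs (request count), con (concurrency), quant, loras, and tp/ep/pp/gpus (parallel mapping).
      # The exact semantics are defined by the test harness, not by this list.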
# deepseek_v3_lite_fp8 - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128] - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:2000,500] @@ -158,12 +84,6 @@ trt_llm_release_perf_test: - '*l20*' - '*h20*' tests: - #llama_v3.1_8b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-float16-maxbs:256-input_output_len:128,128-beams:4-quant:fp8] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a16_awq] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a8_awq] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32] @@ -171,14 +91,7 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-loras:1-reqs:100-con:2-gpus:1] - #mistral_7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:1000,1000-quant:fp8] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:500,2000-quant:fp8] - #phi_3_mini_4k_instruct - #trt backend - - perf/test_perf.py::test_perf[phi_3_mini_4k_instruct-bench-float16-maxbs:128-input_output_len:1000,1000-quant:fp8] - - perf/test_perf.py::test_perf[phi_3_mini_4k_instruct-bench-float16-maxbs:64-input_output_len:500,2000-quant:fp8] + - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250] - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:2000,2000-con:250] @@ -188,18 +101,6 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250] -- condition: - terms: - supports_fp8: true - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*h20*' - tests: - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:1] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:250] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:10-con:250] # 2 gpus test - condition: @@ -216,21 +117,13 @@ trt_llm_release_perf_test: - '*h20*' tests: #llama_v3.1_8b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2] - #pytorch backend + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2] - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] - 
perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] #mixtral_8x7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-loras:8-gpus:2] #pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-loras:8-gpus:2] #llama_v3.2_1b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:1-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:250-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-gpus:2] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:2000,500-reqs:10-con:1-gpus:2] - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:2] @@ -238,9 +131,6 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:512,32-gpus:2] - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:512,200-gpus:2] - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:10-con:1-gpus:2] - #t5 - - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] - - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2] - condition: ranges: @@ -255,14 +145,6 @@ trt_llm_release_perf_test: - '*a100*' - '*h20*' tests: - #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_70b_sq_per_tensor-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-gpus:2] - #mixtral_8x7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2] #pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-streaming-float16-input_output_len:128,128-gpus:2] @@ -282,23 +164,13 @@ trt_llm_release_perf_test: - '*l20*' - '*h20*' tests: - #llama_v3.2_1b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,32-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,200-quant:fp8-gpus:2] #mixtral_8x7b_v0.1_fp8 pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:2] - #mistral_7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8-tp:2] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8-tp:2] #phi_3_mini_128k_instruct - #trt backend - - 
perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-float16-maxbs:128-input_output_len:1000,1000-quant:fp8-tp:2] - - perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-float16-maxbs:128-input_output_len:500,2000-quant:fp8-tp:2] + #pytorch backend + - perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-pytorch-float16-maxbs:128-input_output_len:1000,1000-quant:fp8-tp:2] + - perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-pytorch-float16-maxbs:128-input_output_len:500,2000-quant:fp8-tp:2] - condition: terms: @@ -314,15 +186,10 @@ trt_llm_release_perf_test: - '*h200*' - '*h20*' tests: - #mixtral_8x7b_v0.1 - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:512,32-quant:fp8-gpus:2] #pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:2] - #llama_v3.2_1b trt backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:2] + # 4 gpus test - condition: @@ -338,18 +205,12 @@ trt_llm_release_perf_test: - '*h20*' tests: - - perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[flan_t5_xxl-cppmanager-exe-plugin_ifb-float16-input_output_len:512,32-gpus:4] - - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4] - - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] - - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] #llama_v3.1_70b #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:512,32-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:512,32-gpus:4] # FP8 specific tests - condition: @@ -365,10 +226,6 @@ trt_llm_release_perf_test: - '*l40s*' - '*h20*' tests: - #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-tp:4] #llama_v3.3_70b_instruct_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-gpus:4] @@ -376,26 +233,6 @@ trt_llm_release_perf_test: - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4] - # Llama-Nemotron-Super-49B-v3.3 - # cpp - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-con:250-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4] - # pyt - # bfloat16 - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - # fp8 prequantized - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - condition: @@ -411,15 +248,7 @@ trt_llm_release_perf_test: - '*a100*' - '*h20*' tests: - # E2E trtllm-bench #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-cppmanager-exe-plugin_ifb-float16-input_output_len:200,2000-reqs:64-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:8-con:1-gpus:8] # timeout for h20, move to l2 test - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8] - - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] @@ -427,8 +256,6 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] - - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] - - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] - condition: ranges: @@ -444,25 +271,10 @@ trt_llm_release_perf_test: tests: # E2E trtllm-bench #mixtral_8x7b_v0.1_instruct - #trt backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:64-gpus:8] # timeout for a100 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:50-gpus:8] # timeout for a100 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:1-gpus:8] # timeout for a100 #pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8] # timeout for a100 - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:10-con:50-gpus:8] # timeout for a100 - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:10-con:1-gpus:8] # timeout for a100 - # Llama-3_1-Nemotron-Ultra-253B-v1 - # all cpp backend, bf16->fp8 post-quantized - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:250-con:250-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:250-con:250-tp:8-gpus:8] - # pyt backend, fp8 pre-quantized - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8] # llama_v3.1_405b_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8] @@ -494,33 +306,25 @@ trt_llm_release_perf_test: - 
perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.85-input_output_len:128,128-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.85-input_output_len:512,32-ep:8-tp:8-gpus:8] - #deepseek_r1_fp8 + #llama_v4_scout_17b_16e_instruct #pytorch backend - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] - - -- condition: - ranges: - system_gpu_count: - gte: 8 - gpu_memory: - gt: 80000 - wildcards: - gpu: - - '*h200*' - - '*h20*' - tests: - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-streaming-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] TIMEOUT(40)#min latency test - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(80) #max throughput test - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100) - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(45) + #deepseek_r1_fp8 + #pytorch backend + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-streaming-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(80) #max throughput test + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100) + # qwen3_235b_a22b_fp8 + - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(45) + # FP8 specific tests - condition: @@ -537,14 +341,6 @@ trt_llm_release_perf_test: - '*h20*' tests: #llama_v3.3_70b_instruct_fp8 - 
#trt backend - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:8-con:1-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:64-con:250-gpus:8] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8] @@ -553,7 +349,6 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,200-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:8] # GB chip specific tests diff --git a/tests/integration/test_lists/qa/llm_perf_nim.yml b/tests/integration/test_lists/qa/llm_perf_nim.yml index d89e854378..7887366953 100644 --- a/tests/integration/test_lists/qa/llm_perf_nim.yml +++ b/tests/integration/test_lists/qa/llm_perf_nim.yml @@ -1,5 +1,316 @@ version: 0.0.1 -trt_llm_release_perf_l2_test: +llm_perf_nim: +# one gpu test +- condition: + ranges: + system_gpu_count: + gte: 1 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*a100*' + - '*l40s*' + - '*l20*' + - '*h20*' + tests: + # E2E trtllm-bench + #llama_v3.1_8b_instruct + #trt backend + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32] + # Mistral-7B + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:128,128] + # Phi-4-mini-instruct + # cpp + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-con:250] + # reduced 'reqs' to fit timeout limit + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:8-con:1] + +- condition: + ranges: + system_gpu_count: + gte: 1 + wildcards: + gpu: + - 
'*h100*' + - '*h200*' + - '*h20*' + tests: + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-reqs:10-con:1] + # Llama-3.1-Nemotron-Nano-8B-v1 + # cpp backend + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-con:250] + # pyt backend + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:500-con:250] + # FP8 prequantized pyt backend + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] + #long time llama_nemotron cases + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] # timeout for l20, l40s, a100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1] #timeout for l20, l40s, failed for a100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-reqs:8-con:1] # timeout for 
l20, l40s, failed on a100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250] # failed for a100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-con:250] # failed on A100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:1000,1000-quant:fp8-con:250] # failed on A100 15 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-con:250] # timeout for l20, l40s, a100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-maxnt:20000-input_output_len:20000,2000-quant:fp8-con:250] # timeout for l20, l40s, failed on A100 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250] # failed for l20, need to extend context token to 5000 for l40s and a100, timeout for h20 + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:500-con:250] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250] #need to extend context token to 20000 for l40s, timeout for h20, a100 + +# FP8 specific tests +- condition: + terms: + supports_fp8: true + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*l40s*' + - '*l20*' + - '*h20*' + - '*b200*' + - '*gb200*' + tests: + #llama_v3.1_8b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-float16-maxbs:256-input_output_len:128,128-beams:4-quant:fp8] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a16_awq] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:w4a8_awq] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8] + #mistral_7b_v0.1 + #trt backend + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:1000,1000-quant:fp8] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-maxbs:256-input_output_len:500,2000-quant:fp8] + #phi_3_mini_4k_instruct + #trt backend + - perf/test_perf.py::test_perf[phi_3_mini_4k_instruct-bench-float16-maxbs:128-input_output_len:1000,1000-quant:fp8] + - perf/test_perf.py::test_perf[phi_3_mini_4k_instruct-bench-float16-maxbs:64-input_output_len:500,2000-quant:fp8] + +- condition: + terms: + supports_fp8: true + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*h20*' + tests: + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:1] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:10-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-bfloat16-maxbs:32-input_output_len:500,2000-quant:fp8-reqs:10-con:250] + + +# 2 gpus test +- condition: + ranges: + system_gpu_count: + gte: 2 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*a100*' + - '*l40s*' + - '*l20*' + - '*h20*' + tests: + #llama_v3.1_8b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2] + #mixtral_8x7b_v0.1 + #trt backend + - 
perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-loras:8-gpus:2] + #llama_v3.2_1b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:1-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxnt:5000-input_output_len:5000,500-reqs:10-con:250-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-gpus:2] + #t5 + - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] + - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2] + +- condition: + ranges: + system_gpu_count: + gte: 2 + gpu_memory: + gt: 80000 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*a100*' + - '*h20*' + tests: + #llama_v3.1_70b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_70b_sq_per_tensor-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-gpus:2] + #mixtral_8x7b_v0.1 + #trt backend + - perf/test_perf.py::test_perf[mixtral_8x7b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32-gpus:2] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-gpus:2] + +# FP8 specific tests +- condition: + terms: + supports_fp8: true + ranges: + system_gpu_count: + gte: 2 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*l40s*' + - '*l20*' + - '*h20*' + tests: + #llama_v3.2_1b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,32-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:512,200-quant:fp8-gpus:2] + #mistral_7b_v0.1 + #trt backend + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8-tp:2] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8-tp:2] + #phi_3_mini_128k_instruct + #trt backend + - perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-float16-maxbs:128-input_output_len:1000,1000-quant:fp8-tp:2] + - perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-float16-maxbs:128-input_output_len:500,2000-quant:fp8-tp:2] + +- condition: + terms: + supports_fp8: true + ranges: + system_gpu_count: + gte: 2 + gpu_memory: + gt: 80000 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*h20*' + tests: + #mixtral_8x7b_v0.1 + #trt backend + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:512,32-quant:fp8-gpus:2] + #llama_v3.2_1b trt backend + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:2] + +# 4 gpus test +- condition: + ranges: + system_gpu_count: + gte: 4 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*a100*' + - '*l40s*' + - '*h20*' + tests: + - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] + - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] + #llama_v3.1_70b + 
#trt backend + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-streaming-bfloat16-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,32-gpus:4] + +# FP8 specific tests +- condition: + terms: + supports_fp8: true + ranges: + system_gpu_count: + gte: 4 + wildcards: + gpu: + - '*b200*' + - '*gb200*' + - '*h100*' + - '*h200*' + - '*l40s*' + - '*h20*' + tests: + #llama_v3.1_70b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-tp:4] + # Llama-Nemotron-Super-49B-v3.3 + # cpp + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-con:250-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-con:250-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4] + # pyt + # bfloat16 + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] + # fp8 prequantized + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] + +- condition: + ranges: + system_gpu_count: + gte: 8 + gpu_memory: + gt: 80000 + wildcards: + gpu: + - '*h100*' + - '*h200*' + - '*a100*' + - '*h20*' + tests: + # E2E trtllm-bench + #llama_v3.1_70b + #trt backend + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-cppmanager-exe-plugin_ifb-float16-input_output_len:200,2000-reqs:64-gpus:8] + - 
perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:200,2000-reqs:8-con:1-gpus:8] # timeout for h20, move to l2 test + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8] + - condition: ranges: system_gpu_count: @@ -8,9 +319,27 @@ trt_llm_release_perf_l2_test: gt: 100000 wildcards: gpu: + - '*h100*' - '*h200*' - '*h20*' tests: + #mixtral_8x7b_v0.1_instruct + #trt backend + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:64-gpus:8] # timeout for a100 + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:50-gpus:8] # timeout for a100 + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-float16-input_output_len:128,128-reqs:10-con:1-gpus:8] # timeout for a100 + # Llama-3_1-Nemotron-Ultra-253B-v1 + # all cpp backend, bf16->fp8 post-quantized + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:250-con:250-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:250-con:250-tp:8-gpus:8] + # pyt backend, fp8 pre-quantized + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8] + #deepseek_r1_fp8 - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] #min latency test - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] #max throughput test - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250] @@ -30,14 +359,28 @@ trt_llm_release_perf_l2_test: - '*h20*' tests: - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4] # timeout for h100 - # Llama-3.3-Nemotron-Super-49B-v1 - # trt backend - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:5000,500-reqs:4-con:1-gpus:4] - - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-con:250-gpus:4] # timeout for h100 - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-quant:fp8-con:250-gpus:4] # timeout for h100 - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4] # timeout for h100 - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-con:250-gpus:4] # timeout for h100 - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-streaming-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-con:250-gpus:4] # timeout for h100 + #llama_v3.3_70b_instruct_fp8 + # FP8 specific tests +- condition: + terms: + supports_fp8: true + ranges: + system_gpu_count: + gte: 8 + wildcards: + gpu: + - '*b200*' + - '*h100*' + - '*h200*' + - '*l40s*' + - '*h20*' + tests: + #llama_v3.3_70b_instruct_fp8 + #trt backend + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:512,32-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-streaming-float8-maxbs:16-input_output_len:512,32-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:8-con:1-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-quant:fp8-reqs:64-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:8-con:1-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-reqs:64-con:250-gpus:8] diff --git a/tests/integration/test_lists/qa/llm_perf_sanity.yml b/tests/integration/test_lists/qa/llm_perf_sanity.yml index 2853c656a8..bfde42d04c 100644 --- a/tests/integration/test_lists/qa/llm_perf_sanity.yml +++ b/tests/integration/test_lists/qa/llm_perf_sanity.yml @@ -14,28 +14,15 @@ trt_llm_release_perf_sanity_test: - '*h20*' tests: # E2E trtllm-bench - - perf/test_perf.py::test_perf[gpt_350m_moe-bench-float16-maxbs:64-input_output_len:128,128] - # E2E BERT - - perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] - - perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] - - # Common models for all GPUs - - perf/test_perf.py::test_perf[starcoder2_3b-bench-float16-maxbs:1-input_output_len:512,200-reqs:10] - - perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] - - perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] - - # E2E ENC-DEC - - perf/test_perf.py::test_perf[mbart_large_50_many_to_one_mmt-cppmanager-exe-plugin_ifb-float16-input_output_len:128,20] - - 
perf/test_perf.py::test_perf[bart_large_cnn-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] - - perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] #llama_v3.1_8b_instruct #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:500,2000] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128] # Phi-4-multimodal-instruct @@ -44,38 +31,11 @@ trt_llm_release_perf_sanity_test: - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct-bench-pytorch-bfloat16-input_output_len:128,128] # Ministral-8B - perf/test_perf.py::test_perf[ministral_8b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:500-con:250] + - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000] # Test list validation - test_list_validation.py::test_list_validation -# Tests for GPUs with memory > 25000MB -- condition: - ranges: - system_gpu_count: - gte: 1 - gpu_memory: - gt: 25000 - wildcards: - gpu: - - '*h100*' - - '*h200*' - - '*a100*' - - '*l40s*' - - '*l20*' - - '*h20*' - tests: - # E2E gptManagerBenchmark IFB - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-static_batching-plugin_ifb-float16-bs:8+64-input_output_len:128,128+512,32] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.0-input_output_len:128,128+512,32] - #llama_v3.1_8b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32] - #pytorch backend - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-float16-input_output_len:128,128] # FP8 specific tests - condition: @@ -90,15 +50,14 @@ trt_llm_release_perf_sanity_test: - '*h20*' tests: #llama_v3.1_8b_instruct_fp8 - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:fp8] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32-quant:fp8] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] - 
perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32] - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1] - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:8-con:1] - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] + - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250] + - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250] # Tests for systems with 2+ GPUs - condition: @@ -114,13 +73,7 @@ trt_llm_release_perf_sanity_test: - '*l20*' - '*h20*' tests: - - perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] - - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] #llama_v3.1_8b_instruct - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] @@ -142,10 +95,7 @@ trt_llm_release_perf_sanity_test: - '*l20*' - '*h20*' tests: - - perf/test_perf.py::test_perf[llama_v3.1_8b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2] + #mixtral_8x7b_v0.1_fp8 pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2] # Tests for systems with 2+ GPUs and high memory @@ -181,11 +131,9 @@ trt_llm_release_perf_sanity_test: tests: #llama_v3.1_70b #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4] #pytorch backend + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4] - - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] # FP8 specific tests - condition: @@ -201,6 +149,7 @@ trt_llm_release_perf_sanity_test: - '*l40s*' - '*h20*' tests: + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - 
perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] @@ -220,16 +169,11 @@ trt_llm_release_perf_sanity_test: - '*h20*' tests: #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8] - - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:8] - # FP8 tests for systems with 8+ GPUs - condition: @@ -247,13 +191,13 @@ trt_llm_release_perf_sanity_test: tests: #llama_v3.1_70b - #trt backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8] #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8] #llama_v3.3_70b_instruct_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8] + - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8] + - condition: terms: @@ -270,4 +214,9 @@ trt_llm_release_perf_sanity_test: tests: - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128] - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-streaming-pytorch-float8-input_output_len:128,128] - - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:128,128-con:256-ep:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(100) + - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(45) From 2e90b0b550f6581dd1266a1b432824e90db1c7e0 Mon Sep 17 00:00:00 2001 From: "Pengbo Wang @ NVIDIA" <221450789+pengbowang-nv@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:47:45 +0800 Subject: [PATCH 013/186] [None][fix] Explicitly add tiktoken as required by kimi k2 (#6663) --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c928e9836b..a7748aa3d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,4 +62,5 @@ blake3 llguidance==0.7.29 soundfile triton==3.3.1; 
platform_machine == "x86_64" +tiktoken blobfile From f7f46a50175811c10b1d9ced8c102eb43f563133 Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:01:42 +0800 Subject: [PATCH 014/186] doc: remove the outdated features which marked as Experimental (#5995) Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- README.md | 2 +- docs/source/advanced/disaggregated-service.md | 4 ++-- docs/source/advanced/gpt-attention.md | 2 -- docs/source/advanced/speculative-decoding.md | 2 +- docs/source/architecture/model-weights-loader.md | 2 +- docs/source/performance/perf-benchmarking.md | 9 --------- docs/source/reference/precision.md | 3 +-- docs/source/torch.md | 5 ++--- examples/auto_deploy/README.md | 4 ++-- examples/disaggregated/README.md | 2 +- examples/eagle/README.md | 1 - examples/models/core/deepseek_v3/README.md | 6 +++--- examples/models/core/llama/README.md | 4 ++-- examples/sample_weight_stripping/README.md | 4 ++-- 14 files changed, 18 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index bb58d309a5..5ab7fb51b7 100644 --- a/README.md +++ b/README.md @@ -253,5 +253,5 @@ Deprecation is used to inform developers that some APIs and tools are no longer ## Useful Links - [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT-LLM. - [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT-LLM. -- [AutoDeploy](./examples/auto_deploy/README.md): An experimental backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models. +- [AutoDeploy](./examples/auto_deploy/README.md): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models. - [WeChat Discussion Group](https://github.com/NVIDIA/TensorRT-LLM/issues/5359): A real-time channel for TensorRT-LLM Q&A and news. diff --git a/docs/source/advanced/disaggregated-service.md b/docs/source/advanced/disaggregated-service.md index e5c4a19ba4..d8e376d62c 100644 --- a/docs/source/advanced/disaggregated-service.md +++ b/docs/source/advanced/disaggregated-service.md @@ -1,10 +1,10 @@ (disaggregated-service)= -# Disaggregated-Service (Experimental) +# Disaggregated-Service (Prototype) ```{note} Note: -This feature is currently experimental, and the related API is subjected to change in future versions. +This feature is currently in prototype, and the related API is subjected to change in future versions. ``` Currently TRT-LLM supports `disaggregated-service`, where the context and generation phases of a request can run on different executors. TRT-LLM's disaggregated service relies on the executor API, please make sure to read the [executor page](executor.md) before reading the document. diff --git a/docs/source/advanced/gpt-attention.md b/docs/source/advanced/gpt-attention.md index 9fa1ae9b43..760637aed4 100644 --- a/docs/source/advanced/gpt-attention.md +++ b/docs/source/advanced/gpt-attention.md @@ -112,8 +112,6 @@ printed. #### XQA Optimization Another optimization for MQA/GQA in generation phase called XQA optimization. -It is still experimental feature and support limited configurations. LLAMA2 70B -is one model that it supports. 
Support matrix of the XQA optimization: - FP16 / BF16 compute data type. diff --git a/docs/source/advanced/speculative-decoding.md b/docs/source/advanced/speculative-decoding.md index 85a87ae062..5b52c8e8a7 100644 --- a/docs/source/advanced/speculative-decoding.md +++ b/docs/source/advanced/speculative-decoding.md @@ -168,7 +168,7 @@ TensorRT-LLM implements the ReDrafter model such that logits prediction, beam se The EAGLE approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. Similarly to ReDrafter, it predicts draft tokens using a recurrent predictor where each draft token depends on the previous one. However, unlike ReDrafter, it uses a single-layer transformer model to predict draft tokens from previous hidden states and decoded tokens. In the EAGLE-1 decoding tree needs to be known during the decoding. In the EAGLE-2 this tree is asssembled during the execution by searching for the most probable hypothesis along the beam. -Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft tokens acceptance and draft token generation are performed inside of the TensorRT engine. EAGLE-1 and EAGLE-2 are both supported, while EAGLE-2 is currently in the experimental stage. Please, visit the [EAGLE README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/eagle/README.md) for information about building and running the model. +Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft tokens acceptance and draft token generation are performed inside of the TensorRT engine(EAGLE-1 and EAGLE-2 are both supported). Please, visit the [EAGLE README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/eagle/README.md) for information about building and running the model. ### Disaggregated Serving diff --git a/docs/source/architecture/model-weights-loader.md b/docs/source/architecture/model-weights-loader.md index eb393d4a7d..361c385349 100644 --- a/docs/source/architecture/model-weights-loader.md +++ b/docs/source/architecture/model-weights-loader.md @@ -249,7 +249,7 @@ for tllm_key, param in tqdm(trtllm_model.named_parameters()): In this mode, every precision require user's own support. ## Trouble shooting -The weights loader is an experimental feature for now, and is enabled for LLaMA family models and Qwen models by default. +The weights loader is enabled for LLaMA family models and Qwen models by default with TensorRT flow only. If users are encountered with failure caused by `ModelWeightsLoader`, a workaround is passing environmental variable `TRTLLM_DISABLE_UNIFIED_CONVERTER=1` to disable the model weights loader and fallback to the legacy path. diff --git a/docs/source/performance/perf-benchmarking.md b/docs/source/performance/perf-benchmarking.md index 814e27b3d3..a7ecc86f26 100644 --- a/docs/source/performance/perf-benchmarking.md +++ b/docs/source/performance/perf-benchmarking.md @@ -236,15 +236,6 @@ The following command builds an FP8 quantized engine by specifying the engine tu trtllm-bench --model meta-llama/Llama-3.1-8B build --quantization FP8 --max_seq_len 4096 --max_batch_size 1024 --max_num_tokens 2048 ``` -- [Experimental] Build engine with target ISL/OSL for optimization: -In this experimental mode, you can provide hints to `trtllm-bench`'s tuning heuristic to optimize the engine on specific ISL and OSL targets. 
-Generally, the target ISL and OSL aligns with the average ISL and OSL of the dataset, but you can experiment with different values to optimize the engine using this mode. -The following command builds an FP8 quantized engine and optimizes for ISL:OSL targets of 128:128. - -```shell -trtllm-bench --model meta-llama/Llama-3.1-8B build --quantization FP8 --max_seq_len 4096 --target_isl 128 --target_osl 128 -``` - #### Parallelism Mapping Support The `trtllm-bench build` subcommand supports combinations of tensor-parallel (TP) and pipeline-parallel (PP) mappings as long as the world size (`tp_size x pp_size`) `<=` `8`. The parallelism mapping in build subcommad is controlled by `--tp_size` and `--pp_size` options. The following command builds an engine with TP2-PP2 mapping. diff --git a/docs/source/reference/precision.md b/docs/source/reference/precision.md index 2d30c9053a..b31eff6d62 100644 --- a/docs/source/reference/precision.md +++ b/docs/source/reference/precision.md @@ -103,8 +103,7 @@ Python function, for details. This release includes examples of applying GPTQ to [GPT-NeoX](source:examples/models/core/gpt) and [LLaMA-v2](source:examples/models/core/llama), as well as an example of using AWQ with -[GPT-J](source:examples/models/contrib/gpt). Those examples are experimental implementations and -are likely to evolve in a future release. +[GPT-J](source:examples/models/contrib/gptj). ## FP8 (Hopper) diff --git a/docs/source/torch.md b/docs/source/torch.md index b04c98db1d..c3283b5290 100644 --- a/docs/source/torch.md +++ b/docs/source/torch.md @@ -2,10 +2,9 @@ ```{note} Note: -This feature is currently experimental, and the related API is subjected to change in future versions. +This feature is currently in beta, and the related API is subjected to change in future versions. ``` - -To enhance the usability of the system and improve developer efficiency, TensorRT-LLM launches a new experimental backend based on PyTorch. +To enhance the usability of the system and improve developer efficiency, TensorRT-LLM launches a new backend based on PyTorch. The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You can try it via importing `tensorrt_llm._torch`. diff --git a/examples/auto_deploy/README.md b/examples/auto_deploy/README.md index 399d31ce36..cba226e731 100644 --- a/examples/auto_deploy/README.md +++ b/examples/auto_deploy/README.md @@ -6,7 +6,7 @@
-AutoDeploy is an experimental feature in beta stage designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models like those from Hugging Face, to TensorRT-LLM. It automates graph transformations to integrate inference optimizations such as tensor parallelism, KV-caching and quantization. AutoDeploy supports optimized in-framework deployment, minimizing the amount of manual modification needed. +AutoDeploy is a prototype feature in beta stage designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models like those from Hugging Face, to TensorRT-LLM. It automates graph transformations to integrate inference optimizations such as tensor parallelism, KV-caching and quantization. AutoDeploy supports optimized in-framework deployment, minimizing the amount of manual modification needed. ______________________________________________________________________ @@ -450,4 +450,4 @@ the current progress in AutoDeploy and where you can help. ## Disclaimer -This project is in active development and is currently in an early (beta) stage. The code is experimental, subject to change, and may include backward-incompatible updates. While we strive for correctness, we provide no guarantees regarding functionality, stability, or reliability. Use at your own risk. +This project is in active development and is currently in an early (beta) stage. The code is in prototype, subject to change, and may include backward-incompatible updates. While we strive for correctness, we provide no guarantees regarding functionality, stability, or reliability. Use at your own risk. diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 99bd3de208..713e69e6be 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -83,7 +83,7 @@ Or using the provided client parsing the prompts from a file and sending request python3 ./clients/disagg_client.py -c disagg_config.yaml -p ./clients/prompts.json -e chat ``` -## Dynamic scaling (Experimental) +## Dynamic scaling (Prototype) Currently, trtllm supports dynamic addition and removal of servers by leveraging ETCD. To enable this feature, you should start the context and generation servers with an additional flag ```--metadata_server_config_file``` and ```--server_role```. Before launching the context and generation servers, you should first start the ETCD server. By default, the ETCD server listens for client requests at ```localhost:2379```. diff --git a/examples/eagle/README.md b/examples/eagle/README.md index 637223afb9..0b103ca40e 100644 --- a/examples/eagle/README.md +++ b/examples/eagle/README.md @@ -98,7 +98,6 @@ To run non-greedy sampling and use typical acceptance, set `--eagle_posterior_th `--temperature` can be specified as well. When no `--eagle_posterior_threshold` is specified or `--temperature=0.0` is set, greedy sampling is used. #### Run EAGLE-2 -**EAGLE-2 is still under the experimental stage.** EAGLE-2 can be enabled with 2 runtime flags (`--eagle_use_dynamic_tree` and `--eagle_dynamic_tree_max_top_k=N`). The same engine can be used for EAGLE-1 and EAGLE-2. Eagle choices must not be set in case of EAGLE-2. EAGLE-2 will generate the tree corresponding to choices dynamically in the runtime. For more details, please refer to [EAGLE-2 paper](https://arxiv.org/pdf/2406.16858). 
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 3f05358805..2efe14b986 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -30,7 +30,7 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/ - [trtllm-serve](#trtllm-serve) - [Disaggregated Serving](#disaggregated-serving) - [Dynamo](#dynamo) - - [tensorrtllm\_backend for triton inference server (Experimental)](#tensorrtllm_backend-for-triton-inference-server-experimental) + - [tensorrtllm\_backend for triton inference server (Prototype)](#tensorrtllm_backend-for-triton-inference-server-prototype) - [Advanced Usages](#advanced-usages) - [Multi-node](#multi-node) - [mpirun](#mpirun) @@ -392,8 +392,8 @@ settings for your specific use case. NVIDIA Dynamo is a high-throughput low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments. Dynamo supports TensorRT-LLM as one of its inference engine. For details on how to use TensorRT-LLM with Dynamo please refer to [LLM Deployment Examples using TensorRT-LLM](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/README.md) -### tensorrtllm_backend for triton inference server (Experimental) -To serve the model using [tensorrtllm_backend](https://github.com/triton-inference-server/tensorrtllm_backend.git), make sure the version is v0.19+ in which the pytorch path is added as an experimental feature. +### tensorrtllm_backend for triton inference server (Prototype) +To serve the model using [tensorrtllm_backend](https://github.com/triton-inference-server/tensorrtllm_backend.git), make sure the version is v0.19+ in which the pytorch path is added as a prototype feature. The model configuration file is located at https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/llmapi/tensorrt_llm/1/model.yaml diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md index bef4f60123..b888b287b0 100644 --- a/examples/models/core/llama/README.md +++ b/examples/models/core/llama/README.md @@ -676,7 +676,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_2gpu_fp8 \ The peak GPU memory consumption when doing FP8 quantizaton is more than 210GB (there is also some activation memory occupation when doing calibration). So you need a node with at least 4 H100(A100) to run the quantization command. After quantization, 2 GPUs are okay to for building and run. -Experimental: use FP8 GEMV to optimize performance in FP8 small-batch-size cases. +Note: use FP8 GEMV to optimize performance in FP8 small-batch-size cases. ```bash # Quantize HF LLaMA 7B into FP8 and export trtllm checkpoint @@ -694,7 +694,7 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp8 \ --gemm_plugin fp8 ``` -**Note**: FP8 gemm plugin is an experimental feature aimed to improve performance in small-batch-size cases(e.g. BS<=4). Although inputs with batch size larger than 4 can be correctly inferenced, the performance may decrease as batch size grows. +**Note**: FP8 gemv plugin uses CUDA cores to compute, by contrast to Tensor Core gemm kernel within cuBLAS. Over last year, as cuBLAS have improved their performance by a lot under small M case for Hopper(sm90), FP8 gemv kernel may or may not surpass cuBLAS, depending on specific gemm problem shape. Nonetheless, we still strongly recommend FP8 gemv kernel for Ada (sm89) as cuBLAS still falls behind gemv on it. 
### Groupwise quantization (AWQ/GPTQ) One can enable AWQ/GPTQ INT4 weight only quantization with these options when building engine with `trtllm-build`: diff --git a/examples/sample_weight_stripping/README.md b/examples/sample_weight_stripping/README.md index bd28a60b84..a005f0904b 100644 --- a/examples/sample_weight_stripping/README.md +++ b/examples/sample_weight_stripping/README.md @@ -12,7 +12,7 @@ * [Llama-7b FP16 + WoQ INT8](#llama-7b-fp16-woq-int8) * [Llama2-70b FP8 with TP=2](#llama2-70b-fp8-with-tp2) - [Engine Plan File Size Results](#engine-plan-file-size-results) -- [Experimental](#experimental) +- [Prototype](#prototype) * [Checkpoint Pruner](#checkpoint-pruner) * [Pruning a TensorRT-LLM Checkpoint](#pruning-a-tensorrt-llm-checkpoint) @@ -239,7 +239,7 @@ python3 ../summarize.py --engine_dir engines/llama2-70b-hf-fp8-tp2.refit \ |llama-7b FP16 + WoQ INT8 | 6.54GB | 28.69MB | |llama2-70b FP8 + TP=2 | 64.78GB | 60.61MB | -## Experimental +## Prototype ### Checkpoint Pruner The checkpoint pruner allows you to strip `Conv` and `Gemm` weights out of a TensorRT-LLM [checkpoint](https://nvidia.github.io/TensorRT-LLM/latest/architecture/checkpoint.html). Since these make up the vast majority of weights, the pruner will decrease the size of your checkpoint up to 99%. From 157ea77549c76a7ab17b2f28e7194340a8e6b293 Mon Sep 17 00:00:00 2001 From: YueWeng <25103990+yweng0828@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:25:17 +0800 Subject: [PATCH 015/186] [https://nvbugs/5375966][chore] Unwaive test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one (#6658) Signed-off-by: Yue Weng <25103990+yweng0828@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index f5f55f8f3c..6bb4398186 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -232,7 +232,6 @@ accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder] SKIP (https://nvbugs/5355128) examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (https://nvbugs/5355128) stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646) -full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5375966) accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620) test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] SKIP (https://nvbugs/5380570) test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1] SKIP (https://nvbugs/5380570) From 3e41e6c077756a7487b16372ccfa5088bce37f1b Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Thu, 7 Aug 2025 11:00:15 +0800 Subject: [PATCH 016/186] [TRTLLM-6892][infra] Run guardwords scan first in Release Check stage (#6659) Signed-off-by: Yiqing Yan Co-authored-by: Yanchao Lu --- jenkins/L0_MergeRequest.groovy | 43 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 48e68efea3..95522b2bf2 100644 --- a/jenkins/L0_MergeRequest.groovy +++ 
b/jenkins/L0_MergeRequest.groovy @@ -386,30 +386,11 @@ def launchReleaseCheck(pipeline) -y""") sh "pip3 config set global.break-system-packages true" sh "git config --global --add safe.directory \"*\"" - // Step 1: cloning tekit source code + // Step 1: Clone TRT-LLM source codes trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true) sh "cd ${LLM_ROOT} && git config --unset-all core.hooksPath" - trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && python3 -u scripts/release_check.py || (git restore . && false)") - // Step 2: build tools - withEnv(['GONOSUMDB=*.nvidia.com']) { - withCredentials([ - gitUsernamePassword( - credentialsId: 'svc_tensorrt_gitlab_read_api_token', - gitToolName: 'git-tool' - ), - string( - credentialsId: 'default-git-url', - variable: 'DEFAULT_GIT_URL' - ) - ]) { - sh "go install ${DEFAULT_GIT_URL}/TensorRT/Infrastructure/licensechecker/cmd/license_checker@v0.3.0" - } - } - // Step 3: Run license check - sh "cd ${LLM_ROOT}/cpp && /go/bin/license_checker -config ../jenkins/license_cpp.json include tensorrt_llm" - - // Step 4: Run guardwords scan + // Step 2: Run guardwords scan def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/) if (env.alternativeTRT || isOfficialPostMergeJob) { trtllm_utils.checkoutSource(SCAN_REPO, SCAN_COMMIT, SCAN_ROOT, true, true) @@ -434,6 +415,26 @@ def launchReleaseCheck(pipeline) echo "Guardwords Scan Results: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/guardwords-scan-results/scan.log" } } + + // Step 3: Run pre-commit checks + trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && python3 -u scripts/release_check.py || (git restore . && false)") + + // Step 4: Run license check + withEnv(['GONOSUMDB=*.nvidia.com']) { + withCredentials([ + gitUsernamePassword( + credentialsId: 'svc_tensorrt_gitlab_read_api_token', + gitToolName: 'git-tool' + ), + string( + credentialsId: 'default-git-url', + variable: 'DEFAULT_GIT_URL' + ) + ]) { + sh "go install ${DEFAULT_GIT_URL}/TensorRT/Infrastructure/licensechecker/cmd/license_checker@v0.3.0" + } + } + sh "cd ${LLM_ROOT}/cpp && /go/bin/license_checker -config ../jenkins/license_cpp.json include tensorrt_llm" } def image = "urm.nvidia.com/docker/golang:1.22" From ee471df07c1644ecfd8784e21bb803033fb5c342 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Thu, 7 Aug 2025 11:36:05 +0800 Subject: [PATCH 017/186] [None][chore] optimize kv cache transfer for context TEP and gen DEP (#6657) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> --- .../batch_manager/cacheFormatter.cpp | 7 +++--- .../batch_manager/mlaCacheFormatter.cpp | 22 +++++++++++++------ .../batch_manager/cacheTransceiverTest.cpp | 8 ++++--- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index 2edfd5f77a..d95ca1b412 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -75,7 +75,6 @@ BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmReques bool CacheFormatter::needSendCache( CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx) { - // int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism; auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx); if (targetInfo.mDupHeadFactor <= 1) { @@ -90,8 +89,9 @@ bool 
CacheFormatter::needSendCache( = selfConfig.getParallelConfig().mTensorParallelism / selfConfig.getParallelConfig().mDPsize; selfTpRankInDpGroup = selfTpRank % selfTPNumInDPGroup; } + int destDPRank = destConfig.getParallelConfig().mEnableAttentionDP ? destConfig.getParallelConfig().mDPrank : 0; - return selfTpRankInDpGroup % targetInfo.mDupHeadFactor == 0; + return (destDPRank % targetInfo.mDupHeadFactor) == (selfTpRankInDpGroup % targetInfo.mDupHeadFactor); } void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::CacheState const& selfConfig, @@ -128,11 +128,12 @@ std::vector CacheFormatter::pickRecvConnections( return ret; } TLLM_CHECK(numConnections == targetInfo.mIRanks.size()); + int selfDPRank = selfConfig.getParallelConfig().mEnableAttentionDP ? selfConfig.getParallelConfig().mDPrank : 0; std::vector ret; for (int i = 0; i < targetInfo.mDomainTPSize; i++) { - if (i % targetInfo.mPeerDupHeadFactor == 0) + if ((i % targetInfo.mPeerDupHeadFactor) == (selfDPRank % targetInfo.mPeerDupHeadFactor)) { for (int j = 0; j < targetInfo.mDomainPPSize; j++) { diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index 810edd6f45..824a31129f 100644 --- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -45,10 +45,12 @@ std::vector MLACacheFormatter::pickRecvConnections( auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx); TLLM_CHECK(numConnections == targetInfo.mIRanks.size()); std::vector ret; - // targetInfo , mRanks [tpranks, dpranks] + // targetInfo , mRanks [tpranks, ppranks] + int dpRank = selfConfig.getParallelConfig().mEnableAttentionDP ? selfConfig.getParallelConfig().mDPrank : 0; + for (int i = 0; i < targetInfo.mDomainPPSize; i++) { - ret.push_back(i); + ret.push_back(i + (dpRank % (targetInfo.mDomainTPSize)) * targetInfo.mDomainPPSize); } return ret; } @@ -58,19 +60,24 @@ bool MLACacheFormatter::needSendCache( { int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism; + int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP + ? destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize + : destConfig.getParallelConfig().mTensorParallelism; + int destDPRank = destConfig.getParallelConfig().mEnableAttentionDP ? destConfig.getParallelConfig().mDPrank : 0; + if (selfConfig.getParallelConfig().mEnableAttentionDP) { int selfTPNumInDPGroup = selfConfig.getParallelConfig().mTensorParallelism / selfConfig.getParallelConfig().mDPsize; - int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP - ? 
destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize - : destConfig.getParallelConfig().mTensorParallelism; + int selfTPrankINDPGroup = selfTpRank % selfTPNumInDPGroup; if (selfTPNumInDPGroup <= destTPNumInDPGroup) { return true; } - return selfTPrankINDPGroup % (selfTPNumInDPGroup / destTPNumInDPGroup) == 0; + + int dupHeadFactor = selfTPNumInDPGroup / destTPNumInDPGroup; + return selfTPrankINDPGroup % dupHeadFactor == destDPRank; } int destTPNum = destConfig.getParallelConfig().mEnableAttentionDP @@ -81,7 +88,8 @@ bool MLACacheFormatter::needSendCache( { return true; } - return selfTpRank % (selfTPNum / destTPNum) == 0; + int dupHeadFactor = selfTPNum / destTPNum; + return selfTpRank % dupHeadFactor == destDPRank; } void MLACacheFormatter::format(TransferSession& session) diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp index 99c40f810f..af916359d0 100644 --- a/cpp/tests/batch_manager/cacheTransceiverTest.cpp +++ b/cpp/tests/batch_manager/cacheTransceiverTest.cpp @@ -1457,12 +1457,15 @@ TEST(targetTest, CacheStateNODP) verifyContext( /*contextRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ true); + verifyContext( /*contextRank*/ 1, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ false); + verifyContext( /*contextRank*/ 2, /*expectRanks*/ {1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ true); verifyContext( /*contextRank*/ 3, /*expectRanks*/ {1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ false); + verifyContext( /*contextRank*/ 4, /*expectRanks*/ {2}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ true); verifyContext( @@ -1474,7 +1477,6 @@ TEST(targetTest, CacheStateNODP) contextTP = 2; genTP = 4; - verifyContext( /*contextRank*/ 0, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectNeedSend*/ true); verifyContext(/*contextRank*/ 1, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, @@ -1564,13 +1566,13 @@ TEST(targetTest, CacheStateContextDP) /*expectNeedSend*/ true); verifyContext( /*contextRank*/ 0, /*generationRank*/ 1, /*expectRanks*/ {1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, - /*expectNeedSend*/ true); + /*expectNeedSend*/ false); verifyContext( /*contextRank*/ 1, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ false); verifyContext( /*contextRank*/ 1, /*generationRank*/ 1, /*expectRanks*/ {1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, - /*expectNeedSend*/ false); + /*expectNeedSend*/ true); verifyContext( /*contextRank*/ 2, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectNeedSend*/ false); From 5fa1914cabd50ddc646301b34f531d3d4540004c Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Thu, 7 Aug 2025 13:39:49 +0800 Subject: [PATCH 018/186] [None][chore] Bump version to 1.1.0rc0 (#6651) Signed-off-by: Yiqing Yan --- README.md | 2 +- examples/constraints.txt | 2 +- tensorrt_llm/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5ab7fb51b7..83cad6eb02 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ TensorRT-LLM [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads) 
[![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-1.0.0rc6-green)](./tensorrt_llm/version.py) +[![version](https://img.shields.io/badge/release-1.1.0rc0-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap) diff --git a/examples/constraints.txt b/examples/constraints.txt index 756d3b8fd2..bd343a675d 100644 --- a/examples/constraints.txt +++ b/examples/constraints.txt @@ -1,3 +1,3 @@ -tensorrt_llm==1.0.0rc6 +tensorrt_llm==1.1.0rc0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 6feaea3d2e..ef6771a07f 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.0.0rc6" +__version__ = "1.1.0rc0" From 85af62184b65f2759fc463ba1f48a8d6d799ed6a Mon Sep 17 00:00:00 2001 From: amitz-nv <203509407+amitz-nv@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:05:36 +0300 Subject: [PATCH 019/186] [TRTLLM-6683][feat] Support LoRA reload CPU cache evicted adapter (#6510) Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 3 + cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 6 + .../batch_manager/peftCacheManager.cpp | 13 +- cpp/tensorrt_llm/executor/loraConfig.cpp | 27 ++-- .../nanobind/batch_manager/bindings.cpp | 3 +- .../pybind/batch_manager/bindings.cpp | 3 +- .../unit_tests/executor/loraConfigTest.cpp | 9 +- tensorrt_llm/_torch/pyexecutor/_util.py | 1 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 + .../_torch/pyexecutor/resource_manager.py | 39 +++++- tensorrt_llm/_utils.py | 7 + tensorrt_llm/executor/worker.py | 6 +- .../unittest/_torch/test_resource_manager.py | 17 ++- tests/unittest/llmapi/test_llm.py | 52 ++++++-- tests/unittest/llmapi/test_llm_pytorch.py | 126 ++++++++---------- tests/unittest/utils/util.py | 93 +------------ 16 files changed, 187 insertions(+), 220 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 0d087d96c0..aedac8c2ac 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2347,6 +2347,9 @@ public: void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager); void moveLoraWeightsToGpu(runtime::BufferManager const& manager); + + // Remove LoRA weights and LoRA config tensors + void removeLoraTensors(); }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index a9a4aec5df..dcebc9c3ac 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -365,4 +365,10 @@ void LlmRequest::moveLoraWeightsToGpu(runtime::BufferManager const& manager) mLoraWeights = gpuLoraWeights; } +void LlmRequest::removeLoraTensors() +{ + 
mLoraWeights.reset(); + mLoraConfig.reset(); +} + } // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp index f513f2a3a1..cc62bd3eb0 100644 --- a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp @@ -591,10 +591,9 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); if (llmRequest->getLoraTaskId().has_value()) { - auto taskId = llmRequest->getLoraTaskId().value(); try { - return mHostLoraCache->determineNumPages(taskId); + return mHostLoraCache->determineNumPages(llmRequest->getLoraTaskId().value()); } catch (std::runtime_error& e) { @@ -602,16 +601,6 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe { return mHostLoraCache->determineNumPages(llmRequest->getLoraConfig().value()); } - if (!llmRequest->getLoraWeights().has_value()) - { - auto const reqId = llmRequest->mRequestId; - std::string errMsg - = "Request ID " + std::to_string(reqId) + " has no LoRA adapter weights while configured with LoRA task " - + std::to_string(taskId) + " that's not found in LoRA CPU cache." - " Note that currently a request with LoRA task that was already loaded is sent without its LoRA weights to save its serialization, copy and deserialization," - " so if this LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported."; - throw PeftTaskNotCachedException(errMsg); - } throw; } } diff --git a/cpp/tensorrt_llm/executor/loraConfig.cpp b/cpp/tensorrt_llm/executor/loraConfig.cpp index 058b1a8671..c8499f36d4 100644 --- a/cpp/tensorrt_llm/executor/loraConfig.cpp +++ b/cpp/tensorrt_llm/executor/loraConfig.cpp @@ -27,26 +27,29 @@ LoraConfig::LoraConfig(IdType taskId, std::optional weights, std::option , mWeights(std::move(weights)) , mConfig(std::move(config)) { - if (mWeights.has_value() || mConfig.has_value()) + if (mConfig.has_value()) { - TLLM_CHECK_WITH_INFO(mWeights.has_value() && mConfig.has_value(), - "Request for LoRA inference must have both lora weights and lora config"); - - SizeType32 constexpr expectedWeightsDims = 2; SizeType32 constexpr expectedConfigDims = 2; + TLLM_CHECK_WITH_INFO( + mConfig.value().getShape().size() == expectedConfigDims, "Expected config tensor to have 2 dimensions"); + TLLM_CHECK_WITH_INFO(mConfig.value().getMemoryType() != MemoryType::kGPU + && mConfig.value().getMemoryType() != MemoryType::kUNKNOWN, + "Expected lora config to be in CPU memory"); + TLLM_CHECK_WITH_INFO( + mConfig.value().getDataType() == DataType::kINT32, "Expected lora config tensor to have type kINT32"); + } + if (mWeights.has_value()) + { + SizeType32 constexpr expectedWeightsDims = 2; + TLLM_CHECK_WITH_INFO( + mConfig.has_value(), "Request for LoRA inference with lora weights must also have lora config"); TLLM_CHECK_WITH_INFO( mWeights.value().getShape().size() == expectedWeightsDims, "Expected weights tensor to have 2 dimensions"); - TLLM_CHECK_WITH_INFO( - mConfig.value().getShape().size() == expectedConfigDims, "Expected config tensor to have 2 dimensions"); + TLLM_CHECK_WITH_INFO(mWeights.value().getMemoryType() != MemoryType::kGPU && mWeights.value().getMemoryType() != MemoryType::kUNKNOWN, "Expected lora weights to be in CPU memory"); - TLLM_CHECK_WITH_INFO(mConfig.value().getMemoryType() != MemoryType::kGPU - && mConfig.value().getMemoryType() != MemoryType::kUNKNOWN, - "Expected lora weights to be in CPU memory"); - 
TLLM_CHECK_WITH_INFO( - mConfig.value().getDataType() == DataType::kINT32, "Expected lora config tensor to have type kINT32"); TLLM_CHECK_WITH_INFO(mConfig.value().getShape()[0] == mWeights.value().getShape()[0], "Expected dim 0 of lora weights and lora config to have the same size"); diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp index 56fdbf14e9..2ac069616e 100644 --- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp @@ -375,7 +375,8 @@ void initBindings(nb::module_& m) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, nb::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, nb::arg("finish_reason")) .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime) - .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, nb::arg("iter_counter")); + .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, nb::arg("iter_counter")) + .def("remove_lora_tensors", &tb::LlmRequest::removeLoraTensors); nb::class_(m, "SequenceSlotManager") .def(nb::init(), nb::arg("max_num_slots"), diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index f0d74f4f99..04faa90c2f 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -381,7 +381,8 @@ void initBindings(pybind11::module_& m) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")) .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime) - .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, py::arg("iter_counter")); + .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, py::arg("iter_counter")) + .def("remove_lora_tensors", &tb::LlmRequest::removeLoraTensors); py::classh(m, "SequenceSlotManager") .def(py::init(), py::arg("max_num_slots"), diff --git a/cpp/tests/unit_tests/executor/loraConfigTest.cpp b/cpp/tests/unit_tests/executor/loraConfigTest.cpp index 2859739f6e..6ce56cccbd 100644 --- a/cpp/tests/unit_tests/executor/loraConfigTest.cpp +++ b/cpp/tests/unit_tests/executor/loraConfigTest.cpp @@ -53,13 +53,12 @@ TEST(LoraConfigTest, invalidInputs) // This should work auto loraConfig = LoraConfig(1, weights, config); + // Having config only without weights is allowed + loraConfig = LoraConfig(1, std::nullopt, config); { - // Only one specified - testInvalid(1, std::nullopt, config, "must have both"); - - // Only one specified - testInvalid(1, weights, std::nullopt, "must have both"); + // Only weights specified without config - not allowed + testInvalid(1, weights, std::nullopt, "lora weights must also have lora config"); } { diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index b7204afeb4..21fa9f91c1 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -502,6 +502,7 @@ def create_py_executor_instance( ) peft_cache_manager = PeftCacheManager( peft_cache_config=executor_config.peft_cache_config, + lora_config=lora_config, model_config=model_binding_config, world_config=world_config, ) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 0fb1f06e96..8aa263bb03 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -285,6 +285,7 @@ class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): self.py_logits_post_processors = kwargs.pop("py_logits_post_processors", None) + self.py_lora_path: str | None = kwargs.pop("py_lora_path", None) # Multimodal data self.py_multimodal_data = kwargs.pop("py_multimodal_data", None) if llm_request is not None: @@ -490,6 +491,7 @@ def executor_request_to_llm_request( if executor_request.lora_config is not None else None, lora_config=executor_request.lora_config.config if executor_request.lora_config is not None else None, + py_lora_path=getattr(executor_request, "py_lora_path", None), mrope_rotary_cos_sin=mrope_rotary_cos_sin, mrope_position_deltas=mrope_position_deltas, lookahead_config=None, diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 9f44649b49..eb33f8aa5b 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -10,9 +10,10 @@ import torch import tensorrt_llm import tensorrt_llm.bindings from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE +from tensorrt_llm.lora_manager import LoraConfig, LoraManager, LoraModelConfig from tensorrt_llm.sampling_params import SamplingParams -from ..._utils import binding_dtype_size, nvtx_range +from ..._utils import binding_dtype_size, binding_to_str_dtype, nvtx_range from ...logger import logger from ...mapping import Mapping from .llm_request import (LlmRequest, LlmRequestState, SamplingConfig, @@ -1170,6 +1171,7 @@ class PeftCacheManager(BaseResourceManager): def __init__(self, peft_cache_config: PeftCacheConfig, + lora_config: LoraConfig, model_config: ModelConfig, world_config: WorldConfig | None = None): import tensorrt_llm.bindings as _tb @@ -1200,8 +1202,36 @@ class PeftCacheManager(BaseResourceManager): model_config=model_config, world_config=world_config, buffer_manager=buffer_manager) + self._lora_config = lora_config + self._lora_model_config = LoraModelConfig( + lora_config.lora_target_modules, + lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size, + binding_to_str_dtype(model_config.data_type)) + self._lora_manager = LoraManager() def add_request_peft(self, request: LlmRequest): + if request.lora_task_id is not None: + is_task_cached = self.impl.is_task_cached(request.lora_task_id) + if is_task_cached: + # PeftCacheManager::addRequestPeft in CPP doesn't allow having only one of [config tensor, weights + # tensor] without the other. Since there's no need for any of them when the LoRA adapter is already + # cached, we can safely remove both from the request. 
+ request.remove_lora_tensors() + elif request.lora_weights is None and request.py_lora_path: + self._lora_manager.load_from_ckpt( + [request.py_lora_path], + model_config=self._lora_model_config, + runtime_mapping=None, + uids=[request.lora_task_id], + ckpt_source=self._lora_config.lora_ckpt_source) + request.lora_weights = self._lora_manager.cpp_lora_weights[ + request.lora_task_id] + + # PeftCacheManager CPP implementation expects an extra dim at index 0 + if request.lora_weights is not None: + request.lora_weights = request.lora_weights.unsqueeze(0) + if request.lora_config is not None: + request.lora_config = request.lora_config.unsqueeze(0) self.impl.add_request_peft(request, True) def ensure_batch(self, @@ -1221,12 +1251,7 @@ class PeftCacheManager(BaseResourceManager): context_batch = scheduled_batch.context_requests generation_batch = scheduled_batch.generation_requests for req in context_batch: - if req.lora_weights is not None and req.lora_config is not None: - req.lora_weights = req.lora_weights.reshape( - [1] + list(req.lora_weights.shape)) - req.lora_config = req.lora_config.reshape( - [1] + list(req.lora_config.shape)) - self.impl.add_request_peft(req, True) + self.add_request_peft(req) py_lora_task_layer_module_configs = self.impl.ensure_batch( context_batch, generation_batch, False) diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 75be272791..d6cce43776 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -180,6 +180,7 @@ _str_to_binding_dtype_dict = dict( bool=DataType.BOOL, fp8=DataType.FP8, ) +_binding_to_str_dtype = {v: k for k, v in _str_to_binding_dtype_dict.items()} _binding_dtype_size = { DataType.INT64: 8, @@ -194,6 +195,12 @@ _binding_dtype_size = { } +def binding_to_str_dtype(binding_dtype) -> str: + ret = _binding_to_str_dtype.get(binding_dtype) + assert ret is not None, f'Unsupported binding dtype: {binding_dtype}' + return ret + + def binding_dtype_size(dtype: DataType): return _binding_dtype_size[dtype] diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 33ed146c9c..db8d84fcc8 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -372,6 +372,7 @@ class GenerationExecutorWorker(GenerationExecutor): def _enqueue_request(self, request: GenerationRequest) -> int: assert request.id is not None + py_lora_path = None if self._lora_manager is not None and request.lora_request is not None: adapter_in_cache = self._lora_manager.is_adapter_in_cpu_cache( request.lora_request.adapter_id) @@ -381,8 +382,8 @@ class GenerationExecutorWorker(GenerationExecutor): task_id=request.lora_request.adapter_id, weights=self._lora_manager.cpp_lora_weights[uid] if not adapter_in_cache else None, - config=self._lora_manager.cpp_lora_config[uid] - if not adapter_in_cache else None) + config=self._lora_manager.cpp_lora_config[uid]) + py_lora_path = request.lora_request.lora_path else: lora_config = None @@ -497,6 +498,7 @@ class GenerationExecutorWorker(GenerationExecutor): kv_cache_retention_config=request.kv_cache_retention_config, context_phase_params=context_phase_params, type=request_type) + executor_request.py_lora_path = py_lora_path if self._is_pytorch_backend and request.multimodal_params is not None: if request.multimodal_params.multimodal_data is not None: diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/test_resource_manager.py index da1dae84ba..21edd013da 100644 --- a/tests/unittest/_torch/test_resource_manager.py +++ 
b/tests/unittest/_torch/test_resource_manager.py @@ -5,11 +5,11 @@ import sys import unittest import numpy as np -import pytest import torch import tensorrt_llm import tensorrt_llm.bindings +from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest from tensorrt_llm._torch.pyexecutor.resource_manager import (KVCacheManager, PeftCacheConfig, PeftCacheManager) @@ -17,6 +17,7 @@ from tensorrt_llm.bindings import ModelConfig as ModelConfigCpp from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.bindings.internal.batch_manager import \ PeftTaskNotCachedException +from tensorrt_llm.lora_manager import LoraConfig DataType = tensorrt_llm.bindings.DataType LoraModule = tensorrt_llm.bindings.LoraModule @@ -247,7 +248,7 @@ class TestResourceManager(unittest.TestCase): lora_config = torch.from_numpy(lora_config) input_tokens = [i + 1 for i in range(max_new_tokens)] - request = tensorrt_llm.bindings.internal.batch_manager.LlmRequest( + request = LlmRequest( request_id=request_id, max_new_tokens=max_new_tokens, input_tokens=input_tokens, @@ -261,15 +262,13 @@ class TestResourceManager(unittest.TestCase): return request def get_lora_data(self): - """Create mock LoRA weights and config that match the C++ validation expectations. + """Create mock LoRA weights and config. Returns: - tuple: (weights tensor, config tensor) formatted correctly for the C++ implementation. + tuple: (weights tensor, config tensor). """ lora_weights = np.load(self.TP1_WEIGHTS_PATH).astype(np.float16) - lora_weights = np.expand_dims(lora_weights, axis=0) lora_config = np.load(self.TP1_CONFIG_PATH) - lora_config = np.expand_dims(lora_config, axis=0) return lora_weights, lora_config def test_successful_mocked_peft_cache_manager_initialization(self): @@ -277,6 +276,7 @@ class TestResourceManager(unittest.TestCase): peft_cache_manager = PeftCacheManager( peft_cache_config=peft_cache_config, + lora_config=LoraConfig(), model_config=self.model_config, ) @@ -290,6 +290,7 @@ class TestResourceManager(unittest.TestCase): peft_cache_manager = PeftCacheManager( peft_cache_config=peft_cache_config, + lora_config=LoraConfig(), model_config=self.model_config, ) @@ -307,6 +308,7 @@ class TestResourceManager(unittest.TestCase): peft_cache_manager = PeftCacheManager( peft_cache_config=peft_cache_config, + lora_config=LoraConfig(), model_config=self.model_config, ) @@ -322,6 +324,7 @@ class TestResourceManager(unittest.TestCase): peft_cache_manager = PeftCacheManager( peft_cache_config=peft_cache_config, + lora_config=LoraConfig(), model_config=self.model_config, ) @@ -349,13 +352,13 @@ class TestResourceManager(unittest.TestCase): self.assertEqual(len(peft_table), self.num_lora_modules) - @pytest.mark.skip(reason="https://nvbugs/5324252") def test_put_get(self): """Test adding a request with properly configured LoRA weights and config.""" peft_cache_config = self.create_peft_cache_config() peft_cache_manager = PeftCacheManager( peft_cache_config=peft_cache_config, + lora_config=LoraConfig(), model_config=self.model_config, ) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 2b7c606bf4..5e82d10b43 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1459,20 +1459,7 @@ def llama_v2_13b_lora_from_dir_test_harness(**llm_kwargs): assert similar(output.outputs[0].text, ref) -@pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeat_calls, repeats_per_call", - [ - # Test eviction and re-loading a previously evicted adapter 
from the LoRA GPU cache, within a single - # llm.generate call, that's repeated twice. - ([ - 2, - ], 1, 2, 2, 3), - # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU - # cache size < LoRA CPU cache size - ([2, 2, 2], 1, 3, 1, 1), - ]) -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora_evict_load_new_adapters( +def _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): # For LoRA checkpoints without finetuned embedding and lm_head, we can either: @@ -1493,6 +1480,43 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( fast_build=True) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache(): + """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single + llm.generate call, that's repeated twice. + """ # noqa: D205 + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[2], + max_loras=1, + max_cpu_loras=2, + repeat_calls=2, + repeats_per_call=3) + + +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache(): + """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU + cache size < LoRA CPU cache size. + """ # noqa: D205 + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[2, 2, 2], + max_loras=1, + max_cpu_loras=3, + repeat_calls=1, + repeats_per_call=1) + + +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_read_from_cache_after_insert(): + """Test that loading and then using the same adapters loaded in cache works.""" + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[3], + max_loras=3, + max_cpu_loras=3, + repeat_calls=2, + repeats_per_call=1) + + def test_llama_7b_peft_cache_config_affects_peft_cache_size(): """Tests that LLM arg of peft_cache_config affects the peft cache sizes. diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 13708aae3c..518772d6f6 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -20,9 +20,7 @@ from .test_llm import (_test_llm_capture_request_error, get_model_path, run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, tinyllama_logits_processor_test_harness) -from utils.util import (EnvVarsContextManager, force_ampere, - run_function_in_sub_process, similar, - skip_gpu_memory_less_than_40gb, +from utils.util import (force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb) from utils.llm_data import llm_models_root @@ -313,20 +311,7 @@ def test_llama_7b_lora_default_modules() -> None: llm.shutdown() -@pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeat_calls, repeats_per_call", - [ - # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single - # llm.generate call, that's repeated twice. 
- ([ - 2, - ], 1, 2, 2, 3), - # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU - # cache size < LoRA CPU cache size - ([2, 2, 2], 1, 3, 1, 1), - ]) -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora_evict_load_new_adapters( +def _check_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): # For LoRA checkpoints without finetuned embedding and lm_head, we can either: @@ -347,60 +332,66 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( cuda_graph_config=None) -@pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeat_calls, repeats_per_call", - [ - # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU - # cache over multiple llm.generate call repeated twice (two calls with the same requests): - # At the end of the 1st llm.generate call: - # The LoRA caches should contain adapters 1, 2 and shouldn't contain adapter 0 (it should have been evicted). - # So in the 2nd call, the worker should: - # - Send req0 with adapter 0 weights (because it was previously evicted) - # - Send the other two requests without their adapter weights as they're already in LoRA CPU cache - # Then, handling of req0 that has weights but not in the cache should evict one of the other two adapters from - # the cache, causing that evicted adapter's request to fail because its weights aren't with the request and - # aren't in LoRA cache. - ([ - 3, - ], 2, 2, 2, 1), - ]) @skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( - lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): - """Tests that trying to load a LoRA adapter after it was evicted from CPU cache fails with the expected - message, as this feature is currently not supported in favor of the performance improvement of not - sending the LoRA weights with every request after the first time. - NOTE: This test assumes the requests are handled in the order they're sent, if that's not true, then this test - may not get any error at all, which would cause it to fail. +def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache(): + """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single + llm.generate call, that's repeated twice. """ # noqa: D205 - - def _check_contains_expected_message(stdout: str, stderr: str): - note_in_message = "Note that currently a request with LoRA task that was already loaded is sent" \ - " without its LoRA weights to save its serialization, copy and deserialization, so if this" \ - " LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported." 
- return note_in_message in stderr - - lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], - max_lora_rank=8, - max_loras=max_loras, - max_cpu_loras=max_cpu_loras) - with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): - child_stdout, child_stderr = run_function_in_sub_process( - target=check_llama_7b_multi_unique_lora_adapters_from_request, - args=(lora_adapter_count_per_call, repeat_calls, repeats_per_call, - LLM), - kwargs={ - "lora_config": lora_config, - # Disable CUDA graph - # TODO: remove this once we have a proper fix for CUDA graph in LoRA - "cuda_graph_config": None - }, - stop_waiting_criteria=_check_contains_expected_message) - - assert _check_contains_expected_message(child_stdout, child_stderr) + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[2], + max_loras=1, + max_cpu_loras=2, + repeat_calls=2, + repeats_per_call=3) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache(): + """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU + cache size < LoRA CPU cache size. + """ # noqa: D205 + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[2, 2, 2], + max_loras=1, + max_cpu_loras=3, + repeat_calls=1, + repeats_per_call=1) + + +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_read_from_cache_after_insert(): + """Test that loading and then using the same adapters loaded in cache works.""" + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[3], + max_loras=3, + max_cpu_loras=3, + repeat_calls=2, + repeats_per_call=1) + + +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache( +): + """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU + cache over multiple llm.generate call repeated twice (two calls with the same requests): + At the end of the 1st llm.generate call: + The LoRA caches should contain adapters 1, 2 and shouldn't contain adapter 0 (it should have been evicted). + So in the 2nd call, the worker should: + - Send req0 with adapter 0 weights (because it was previously evicted) + - Send the other two requests without their adapter weights as they're already in LoRA CPU cache + Then, handling of req0 that has weights but not in the cache should evict one of the other two adapters from + the cache, causing that evicted adapter's request to again load its weights from the file system, as they + aren't with the request and aren't in LoRA cache. + """ # noqa: D205 + _check_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call=[3], + max_loras=2, + max_cpu_loras=2, + repeat_calls=2, + repeats_per_call=1) + + +@skip_gpu_memory_less_than_40gb def test_llama_7b_peft_cache_config_affects_peft_cache_size(): """Tests that LLM arg of peft_cache_config affects the peft cache sizes. @@ -436,6 +427,7 @@ def test_llama_7b_peft_cache_config_affects_peft_cache_size(): cuda_graph_config=None) +@skip_gpu_memory_less_than_40gb def test_llama_7b_lora_config_overrides_peft_cache_config(): """Tests that cache size args in lora_config LLM arg override the cache size parameters in peft_cache_config LLM arg. 
diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py index 7d5c90833a..cbb483b608 100644 --- a/tests/unittest/utils/util.py +++ b/tests/unittest/utils/util.py @@ -1,13 +1,9 @@ -import multiprocessing import os -import sys -import time import unittest from contextlib import contextmanager from difflib import SequenceMatcher -from multiprocessing.connection import Connection from pathlib import Path -from typing import Any, Callable, Generator, Mapping, Tuple +from typing import Any, Generator import pynvml import pytest @@ -425,90 +421,3 @@ def duplicate_list_to_length(list: list[Any], target_length: int) -> list[Any]: if remain != 0: duplicated_list += list[:remain] return duplicated_list - - -def _target_wrapper(target: Callable, stdout_pipe: Connection, - stderr_pipe: Connection, *args, **kwargs) -> None: - - class PipeWriter: - - def __init__(self, conn: Connection): - self.conn = conn - - def write(self, s: str): - self.conn.send_bytes(s.encode("UTF8")) - - def flush(self): - pass - - sys.stdout = PipeWriter(stdout_pipe) - sys.stderr = PipeWriter(stderr_pipe) - target(*args, **kwargs) - - -def run_function_in_sub_process(target: Callable, - args: tuple, - kwargs: Mapping[str, Any], - stop_waiting_criteria: Callable, - poll_interval_seconds: int = 5, - timeout_seconds: int = 240) -> Tuple[str, str]: - multiprocessing.set_start_method("spawn", force=True) - parent_stdout_pipe, child_stdout_pipe = multiprocessing.Pipe() - parent_stderr_pipe, child_stderr_pipe = multiprocessing.Pipe() - child_process = multiprocessing.Process( - target=_target_wrapper, - args=[target, child_stdout_pipe, child_stderr_pipe] + list(args), - kwargs=kwargs) - child_process.start() - child_stdout_pipe.close() - child_stderr_pipe.close() - - def _read_from_pipe(pipe: Connection): - out = "" - while pipe.poll(timeout=0.1): - try: - out += pipe.recv_bytes().decode("UTF8") - except Exception: - break - return out - - child_stdout = "" - child_stderr = "" - try: - total_waiting_seconds = 0 - while child_process.is_alive( - ) and total_waiting_seconds < timeout_seconds: - child_stdout += _read_from_pipe(parent_stdout_pipe) - child_stderr += _read_from_pipe(parent_stderr_pipe) - if stop_waiting_criteria(child_stdout, child_stderr): - break - time.sleep(poll_interval_seconds) - total_waiting_seconds += poll_interval_seconds - finally: - parent_stdout_pipe.close() - parent_stderr_pipe.close() - if child_process.is_alive(): - child_process.terminate() - - assert total_waiting_seconds < timeout_seconds, "Reached timeout while waiting for target" - return child_stdout, child_stderr - - -class EnvVarsContextManager: - - def __init__(self, new_env_vars: dict[str, str]): - self._env_vars = new_env_vars - self._original_value = None - - def __enter__(self): - self._original_vars = { - var_name: os.environ[var_name] - for var_name in self._env_vars.keys() if var_name in os.environ - } - os.environ.update(self._env_vars) - - def __exit__(self, type, value, traceback): - os.environ.update(self._original_vars) - for var_name in self._env_vars.keys(): - if var_name not in self._original_vars: - os.environ.pop(var_name) From 6c1f7d8b9182cfe933ac3126bdabc962cb9eb961 Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Thu, 7 Aug 2025 14:47:10 +0800 Subject: [PATCH 020/186] [None][test] correct test-db context for perf yaml file (#6686) Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com> --- tests/integration/test_lists/qa/llm_perf_cluster.yml | 2 +- 
tests/integration/test_lists/qa/llm_perf_full.yml | 2 +- tests/integration/test_lists/qa/llm_perf_sanity.yml | 2 +- tests/integration/test_lists/qa/llm_trt_integration_perf.yml | 2 +- .../test_lists/qa/llm_trt_integration_perf_sanity.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_perf_cluster.yml b/tests/integration/test_lists/qa/llm_perf_cluster.yml index 47877a3fcc..9760ed21d7 100644 --- a/tests/integration/test_lists/qa/llm_perf_cluster.yml +++ b/tests/integration/test_lists/qa/llm_perf_cluster.yml @@ -1,5 +1,5 @@ version: 0.0.1 -trt_llm_release_perf_cluster_test: +llm_perf_cluster: - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/qa/llm_perf_full.yml b/tests/integration/test_lists/qa/llm_perf_full.yml index 6573c2dd9a..01e6df1647 100644 --- a/tests/integration/test_lists/qa/llm_perf_full.yml +++ b/tests/integration/test_lists/qa/llm_perf_full.yml @@ -1,5 +1,5 @@ version: 0.0.1 -trt_llm_release_perf_test: +llm_perf_full: # one gpu test - condition: ranges: diff --git a/tests/integration/test_lists/qa/llm_perf_sanity.yml b/tests/integration/test_lists/qa/llm_perf_sanity.yml index bfde42d04c..b7293e74b2 100644 --- a/tests/integration/test_lists/qa/llm_perf_sanity.yml +++ b/tests/integration/test_lists/qa/llm_perf_sanity.yml @@ -1,5 +1,5 @@ version: 0.0.1 -trt_llm_release_perf_sanity_test: +llm_perf_sanity: - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/qa/llm_trt_integration_perf.yml b/tests/integration/test_lists/qa/llm_trt_integration_perf.yml index 1d2e3e0150..4841feacc5 100644 --- a/tests/integration/test_lists/qa/llm_trt_integration_perf.yml +++ b/tests/integration/test_lists/qa/llm_trt_integration_perf.yml @@ -1,5 +1,5 @@ version: 0.0.1 -trt_llm_integration_perf_test: +llm_trt_integration_perf: - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/qa/llm_trt_integration_perf_sanity.yml b/tests/integration/test_lists/qa/llm_trt_integration_perf_sanity.yml index 59cf7474a0..96152af29a 100644 --- a/tests/integration/test_lists/qa/llm_trt_integration_perf_sanity.yml +++ b/tests/integration/test_lists/qa/llm_trt_integration_perf_sanity.yml @@ -1,5 +1,5 @@ version: 0.0.1 -trt_llm_integration_perf_sanity_test: +llm_trt_integration_perf_sanity: - condition: ranges: system_gpu_count: From 8207d5fd3996462f4f416f66bc34ee9f1d859df5 Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Thu, 7 Aug 2025 00:04:18 -0700 Subject: [PATCH 021/186] [None] [feat] Add model gpt-oss (#6645) Signed-off-by: Hao Lu <14827759+hlu1@users.noreply.github.com> --- .../tensorrt_llm/common/quantization.h | 73 +- cpp/kernels/fmha_v2/fmha_test.py | 6 +- .../fmha_v2/src/fmha/warpspec/compute.h | 8 +- .../fmha_v2/src/fmha/warpspec/epilogue.h | 135 +- .../fmha_v2/src/fused_multihead_attention.cpp | 165 +- .../fmha_v2/src/fused_multihead_attention.h | 3 + ...sed_multihead_attention_demo_bert_params.h | 2 + .../src/fused_multihead_cross_attention.cpp | 86 +- cpp/kernels/fmha_v2/src/softmax_bf16.cu | 9 +- cpp/kernels/fmha_v2/src/softmax_fp16.cu | 9 +- cpp/kernels/fmha_v2/src/softmax_fp32.cu | 9 +- cpp/kernels/fmha_v2/src/softmax_fp8.cu | 10 +- cpp/kernels/fmha_v2/src/softmax_impl.h | 34 +- cpp/kernels/fmha_v2/src/softmax_int8.cu | 10 +- cpp/kernels/xqa/mha.cu | 33 +- cpp/kernels/xqa/mha.h | 2 + cpp/kernels/xqa/mha_sm90.cu | 59 +- cpp/kernels/xqa/mla_sm120.cu | 7 +- cpp/kernels/xqa/test/refAttention.cpp | 32 +- 
cpp/kernels/xqa/test/refAttention.h | 4 +- cpp/kernels/xqa/test/test.cpp | 45 +- .../mixtureOfExpertsBackendBenchmarkFixture.h | 5 +- cpp/tensorrt_llm/common/attentionOp.cpp | 10 + cpp/tensorrt_llm/common/attentionOp.h | 2 + .../detail/collective/mixed_input_utils.hpp | 107 +- ...a_gmma_rs_warpspecialized_mixed_input_.hpp | 62 +- .../allReduceFusionKernels.cu | 6 +- .../allReduceFusionKernels.h | 2 +- .../mnnvlTwoShotAllreduceKernels.cu | 144 +- .../mnnvlTwoShotAllreduceKernels.h | 2 + .../moeAllReduceFusionKernels.cu | 4 +- .../moeAllReduceFusionKernels.h | 2 +- .../fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp | 3 - .../fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp | 3 - ...28_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ..._k_v_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...6_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp | 4 +- ...8_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp | 4 +- ...16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...ntion_bf16_64_128_S_qkv_128_sm90.cubin.cpp | 2 +- ...4_128_S_qkv_128_softcapping_sm90.cubin.cpp | 2 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...f16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp | 2 +- ...ention_bf16_64_32_S_qkv_128_sm89.cubin.cpp | 2 +- ...ention_bf16_64_32_S_qkv_128_sm90.cubin.cpp | 2 +- ...64_32_S_qkv_128_softcapping_sm90.cubin.cpp | 2 +- ...m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...4_64_256_output_bf16_tma_ws_sm90.cubin.cpp | 4 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ...e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp | 2 +- ...e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp | 2 +- ...p32_128_128_S_q_paged_kv_32_sm89.cubin.cpp | 2 +- ...p32_128_128_S_q_paged_kv_40_sm89.cubin.cpp | 2 +- ...p32_128_128_S_q_paged_kv_48_sm89.cubin.cpp | 2 +- ...p32_128_128_S_q_paged_kv_64_sm89.cubin.cpp | 2 +- ..._e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp | 2 +- ..._e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp | 2 +- ..._e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp | 2 +- ..._e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp | 2 +- ..._e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp | 2 +- ...n_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp | 2 +- ...fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp | 2 +- ...fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp | 2 +- ...fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp | 2 +- ..._q_paged_kv_192_output_bf16_sm89.cubin.cpp | 2 +- ...fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp | 2 +- ...fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp | 2 +- ..._fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp | 2 +- ..._fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp | 2 +- ..._fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp | 2 +- ...n_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp | 2 +- ...8_sage_64_32_32_output_bf16_sm89.cubin.cpp | 2 +- ...8_sage_64_32_32_output_fp16_sm89.cubin.cpp | 2 +- ...n_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp | 2 +- ...n_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp | 2 +- ...64_32_S_qkv_192_output_bf16_sm89.cubin.cpp | 2 +- ...n_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp | 2 +- ...n_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp | 2 +- 
...on_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp | 2 +- ...0_sage_64_32_32_output_bf16_sm89.cubin.cpp | 2 +- ...0_sage_64_32_32_output_fp16_sm89.cubin.cpp | 2 +- ...on_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp | 2 +- ...on_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp | 2 +- ...aged_kv_192x128_output_bf16_sm89.cubin.cpp | 2 +- ..._64_64_S_q_paged_kv_192x128_sm89.cubin.cpp | 2 +- ...aged_kv_576x512_output_bf16_sm89.cubin.cpp | 2 +- ..._64_64_S_q_paged_kv_576x512_sm89.cubin.cpp | 2 +- ...4_S_qkv_192x128_output_bf16_sm89.cubin.cpp | 2 +- ...m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp | 2 +- ...p16_128_128_S_q_paged_kv_64_sm80.cubin.cpp | 2 +- ...28_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ..._k_v_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...6_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp | 4 +- ...8_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp | 4 +- ...16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...p16_64_128_S_q_paged_kv_128_sm80.cubin.cpp | 2 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...ntion_fp16_64_128_S_qkv_128_sm90.cubin.cpp | 2 +- ...4_128_S_qkv_128_softcapping_sm90.cubin.cpp | 2 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...p16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ...ention_fp16_64_32_S_qkv_128_sm90.cubin.cpp | 2 +- ...64_32_S_qkv_128_softcapping_sm90.cubin.cpp | 2 +- ...28_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ..._k_v_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...2_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp | 4 +- ...8_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp | 4 +- ...32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ...q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ...d_kv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp | 4 +- ..._128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp | 4 +- ..._fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp | 2 +- ...4_128_S_qkv_128_softcapping_sm90.cubin.cpp | 2 +- ..._qkv_128_softcapping_tma_ws_sm90.cubin.cpp | 4 +- ...p32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp | 4 +- ...n_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp | 2 +- ...64_32_S_qkv_128_softcapping_sm90.cubin.cpp | 2 +- .../fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp | 3 - .../fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp | 3 - ..._v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp | 3 - ..._v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp | 3 - .../fmhaRunner.cpp | 3 + .../fused_multihead_attention_common.h | 4 + .../kernels/cutlass_kernels/CMakeLists.txt | 14 +- .../kernels/cutlass_kernels/include/common.h | 1 + .../include/moe_gemm_kernels.h | 20 +- .../cutlass_kernels/include/moe_kernels.h | 80 +- .../moe_gemm_tma_ws_mixed_input_launcher.inl | 30 +- .../moe_gemm/moe_gemm_kernels_bf16_fp4.cu | 24 + .../moe_gemm/moe_gemm_kernels_fp16_fp4.cu | 22 + .../moe_gemm/moe_gemm_template_dispatch.h | 125 +- ...emm_template_dispatch_tma_ws_mixed_dtype.h | 16 +- .../cutlass_kernels/moe_gemm/moe_kernels.cu | 309 +- .../moe_tma_warp_specialized_traits.h | 4 +- .../python/generate_kernels.py | 36 +- .../kernels/decoderMaskedMultiheadAttention.h | 5 + .../decoderMaskedMultiheadAttentionTemplate.h | 29 +- .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 7 +- .../xqaParams.h | 1 + .../decoderMaskedMultiheadAttentionUtils.h | 263 +- cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp | 1 + .../fusedLayernormKernels/fp4_converter.cuh | 10 +- ...llm_internal_cutlass_kernels_static.tar.xz | 4 +- 
.../aarch64-linux-gnu/version.txt | 4 +- .../include/moe_gemm_kernels.h | 13 +- .../include/moe_kernels.h | 48 +- ...llm_internal_cutlass_kernels_static.tar.xz | 4 +- .../x86_64-linux-gnu/version.txt | 4 +- cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu | 3 - cpp/tensorrt_llm/kernels/quantization.cu | 170 +- cpp/tensorrt_llm/kernels/quantization.cuh | 509 +- cpp/tensorrt_llm/kernels/quantization.h | 34 +- .../batchedGemm/KernelRunner.cpp | 54 +- .../batchedGemm/KernelRunner.h | 30 +- .../BatchedGemmInterface.h | 37 +- .../trtllmGen_bmm_export/BatchedGemmOptions.h | 5 +- .../GemmGatedActOptions.h | 9 +- .../trtllmGen_bmm_export/GemmOptions.h | 1 + .../trtllmGen_bmm_export/KernelMetaInfo.h | 16706 +++++++++++++++- .../trtllmGen_bmm_export/KernelParams.h | 1385 +- .../trtllmGen_bmm_export/KernelParamsDecl.h | 498 + .../trtllmGen_bmm_export/config.json | 285 +- ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - 
...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + 
...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - 
..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + 
..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + 
..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - 
...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - 
...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + 
...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + .../blockScaleMoe/DevKernel.cu | 11 +- .../blockScaleMoe/DevKernel.h | 5 + .../trtllmGenKernels/blockScaleMoe/runner.cu | 121 +- .../trtllmGenKernels/blockScaleMoe/runner.h | 40 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 
4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- 
...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- 
 [... several hundred more diffstat entries for fmha *_cubin.cpp kernel files, paths truncated to "..." by git, each changed by 3 or 4 lines ...]
 ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +-
 ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +-
 ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +-
 ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +-
 ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +-
 ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +-
 ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +-
 .../fmha/cubin/kernelMetaInfo.h | 1482 +-
 .../trtllmGenKernels/fmha/fmhaKernels.h | 2 +-
 .../trtllmGenKernels/fmha/fmhaRunnerParams.h | 2 +
 .../trtllmGenKernels/fmha/kernelParams.h | 3 +
 .../kernels/userbuffers/userbuffers.cu | 18 +-
 cpp/tensorrt_llm/kernels/xqaDispatcher.cpp | 1 +
 cpp/tensorrt_llm/nanobind/bindings.cpp | 7 +-
 .../mixtureOfExpertsPlugin.cpp | 14 +-
 .../mixtureOfExperts/mixtureOfExpertsPlugin.h | 1 +
 .../quantizeToFP4Plugin.cpp | 6 +-
 .../smoothQuantGemmPlugin.cpp | 2 +-
 cpp/tensorrt_llm/pybind/bindings.cpp | 5 +-
 cpp/tensorrt_llm/thop/CMakeLists.txt | 2 +
 cpp/tensorrt_llm/thop/allreduceOp.cpp | 12 +-
 cpp/tensorrt_llm/thop/attentionOp.cpp | 36 +-
 cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp | 8 +-
 cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp | 29 +-
 cpp/tensorrt_llm/thop/fp4Op.cpp | 102 +-
 cpp/tensorrt_llm/thop/fp4Quantize.cpp | 104 +-
 cpp/tensorrt_llm/thop/fp4Quantize.h | 3 +-
 .../thop/fp8BatchedGemmTrtllmGen.cpp | 9 +-
 cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp | 74 +-
 cpp/tensorrt_llm/thop/fp8Op.cpp | 123 +
 cpp/tensorrt_llm/thop/fp8Op.h | 41 +
 .../thop/fp8PerTensorScaleMoe.cpp | 81 +-
 cpp/tensorrt_llm/thop/moeOp.cpp | 174 +-
 cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp | 508 +
 cpp/tensorrt_llm/thop/mxFp8Quantize.cpp | 115 +
 cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp | 52 +
 .../kernels/allReduce/allReduceFusionTest.cu | 8 +-
 .../allReduce/moeAllReduceFusionTest.cu | 4 +-
 .../kernels/mixtureOfExpertsTest.cu | 156 +-
 .../kernels/mlaChunkedPrefillTest.cu | 5 +-
 .../smoothQuant/smoothQuantKernelTest.cpp | 8 +-
 examples/llm-api/quickstart_advanced.py | 5 +-
 examples/models/core/gpt_oss/README.md | 145 +
 .../openai_chat_client_function_calling.py | 191 +
 requirements.txt | 2 +-
 tensorrt_llm/_mnnvl_utils.py | 68 +-
 .../_torch/attention_backend/interface.py | 2 +
 .../_torch/attention_backend/trtllm.py | 6 +
 .../auto_deploy/utils/quantization_utils.py | 2 +-
 .../_torch/custom_ops/cpp_custom_ops.py | 8 +-
 .../_torch/custom_ops/torch_custom_ops.py | 23 +-
 .../custom_ops/trtllm_gen_custom_ops.py | 669 +
 tensorrt_llm/_torch/distributed/ops.py | 93 +-
 tensorrt_llm/_torch/model_config.py | 240 +-
 tensorrt_llm/_torch/models/__init__.py | 2 +
 .../_torch/models/modeling_gpt_oss.py | 912 +
 .../_torch/models/modeling_mixtral.py | 2 +
 tensorrt_llm/_torch/models/modeling_utils.py | 20 +-
 tensorrt_llm/_torch/modules/attention.py | 35 +-
 tensorrt_llm/_torch/modules/embedding.py | 21 +-
 .../_torch/modules/fused_moe/__init__.py | 6 +-
 .../_torch/modules/fused_moe/create_moe.py | 55 +-
 .../modules/fused_moe/fused_moe_cutlass.py | 212 +-
 .../modules/fused_moe/fused_moe_triton.py | 1318 ++
 .../modules/fused_moe/fused_moe_trtllm_gen.py | 196 +-
 .../modules/fused_moe/fused_moe_wide_ep.py | 6 +-
 .../_torch/modules/fused_moe/interface.py | 52 +-
 .../_torch/modules/fused_moe/quantization.py | 1531 +-
 .../_torch/modules/fused_moe/routing.py | 297 +-
 tensorrt_llm/_torch/modules/linear.py | 88 +-
 tensorrt_llm/_torch/modules/triton_linear.py | 418 +
 tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +
 tensorrt_llm/_torch/pyexecutor/sampler.py | 163 +-
 tensorrt_llm/_torch/speculative/eagle3.py | 3 +-
 tensorrt_llm/_torch/speculative/interface.py | 2 +-
 .../_torch/speculative/model_drafter.py | 10 +-
 tensorrt_llm/_torch/utils.py | 5 +-
 tensorrt_llm/bench/dataclasses/reporting.py | 6 +-
 tensorrt_llm/inputs/registry.py | 47 +-
 tensorrt_llm/layers/moe.py | 5 +-
 tensorrt_llm/llmapi/llm_args.py | 7 +-
 tensorrt_llm/llmapi/llm_utils.py | 19 +-
 tensorrt_llm/llmapi/tokenizer.py | 6 +-
 tensorrt_llm/models/gemma/config.py | 4 +-
 tensorrt_llm/models/gemma/model.py | 2 +-
 tensorrt_llm/models/modeling_utils.py | 4 +-
 tensorrt_llm/quantization/layers.py | 6 +-
 tensorrt_llm/quantization/mode.py | 40 +-
 tensorrt_llm/quantization/utils/fp4_utils.py | 14 +-
 tensorrt_llm/sampling_params.py | 15 +-
 tensorrt_llm/serialization.py | 99 +
 .../defs/accuracy/references/gsm8k.yaml | 8 +
 .../defs/accuracy/references/mmlu.yaml | 18 +
 .../accuracy/test_disaggregated_serving.py | 6 +
 .../defs/accuracy/test_llm_api_pytorch.py | 205 +-
 .../defs/disaggregated/test_disaggregated.py | 1 +
 .../test_lists/qa/llm_function_full.txt | 8 +
 .../test_lists/qa/llm_function_sanity.txt | 8 +
 .../test_lists/test-db/l0_b200.yml | 11 +-
 .../test_lists/test-db/l0_h100.yml | 1 +
 .../library/test_attention_matcher_hf.py | 3 +
 .../_torch/modeling/test_modeling_exaone4.py | 2 +-
 .../_torch/modeling/test_modeling_gpt_oss.py | 89 +
 .../test_modeling_llama_min_latency.py | 6 +
 .../modeling/test_modeling_nemotron_nas.py | 1 +
 .../unittest/_torch/modules/test_fused_moe.py | 1108 +-
 .../_torch/modules/test_moe_routing.py | 115 +-
 .../_torch/modules/test_triton_linear.py | 192 +
 .../_torch/multi_gpu/test_mnnvl_allreduce.py | 16 +-
 .../_torch/multi_gpu/test_user_buffers.py | 2 +-
 tests/unittest/_torch/test_custom_ops.py | 4 +
 .../_torch/thop/test_fp4_bmm_quantize.py | 4 +-
 .../_torch/thop/test_fp4_gemm_quantize.py | 44 +-
 tests/unittest/_torch/thop/test_fp4_linear.py | 5 +-
 .../unittest/_torch/thop/test_fp8_quantize.py | 103 +-
 tests/unittest/_torch/thop/test_moe.py | 729 +-
 tests/unittest/_torch/thop/test_scaled_mm.py | 14 +-
 .../_torch/thop/test_w4a8_mxfp4_mxfp8_gemm.py | 4 +-
 tests/unittest/bindings/test_bindings_ut.py | 3 +-
 .../trt/attention/test_gpt_attention.py | 100 +-
 .../trt/attention/test_gpt_attention_IFB.py | 51 +-
 tests/unittest/trt/functional/test_moe.py | 2 +-
 tests/unittest/trt/quantization/test_mode.py | 2 +-
 tests/unittest/utils/util.py | 15 +
 triton_backend/requirements.txt | 2 +-
 2102 files changed, 33998 insertions(+), 8186 deletions(-)
 delete mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp
 delete mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp
 delete mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp
 delete mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp
 delete mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp
 delete mode 100644 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp
 create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
 create mode 100644 cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu
 create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h
 delete mode 100644
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 
100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp create mode 100644 cpp/tensorrt_llm/thop/mxFp8Quantize.cpp create mode 100644 examples/models/core/gpt_oss/README.md create mode 100644 examples/models/core/gpt_oss/openai_chat_client_function_calling.py create mode 100644 tensorrt_llm/_torch/models/modeling_gpt_oss.py create mode 100755 tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py create mode 100644 tensorrt_llm/_torch/modules/triton_linear.py create mode 100644 tests/unittest/_torch/modeling/test_modeling_gpt_oss.py create mode 100644 tests/unittest/_torch/modules/test_triton_linear.py diff --git a/cpp/include/tensorrt_llm/common/quantization.h b/cpp/include/tensorrt_llm/common/quantization.h index 836faa258f..50aae114e0 100644 --- a/cpp/include/tensorrt_llm/common/quantization.h +++ b/cpp/include/tensorrt_llm/common/quantization.h @@ -122,6 
+122,16 @@ public: return QuantMode(BaseType(1u) << 14); } + static constexpr QuantMode w4a8Mxfp4Mxfp8() noexcept + { + return QuantMode(BaseType(1u) << 15); + } + + static constexpr QuantMode w4a16Mxfp4() noexcept + { + return QuantMode(BaseType(1u) << 16); + } + constexpr BaseType value() const noexcept { return mValue; @@ -202,6 +212,16 @@ public: return isSet(w4a8Mxfp4Fp8()); } + constexpr bool hasW4a8Mxfp4Mxfp8() const noexcept + { + return isSet(w4a8Mxfp4Mxfp8()); + } + + constexpr bool hasW4a16Mxfp4() const noexcept + { + return isSet(w4a16Mxfp4()); + } + constexpr bool hasKvCacheQuant() const noexcept { return hasInt8KvCache() || hasFp8KvCache() || hasFp4KvCache(); @@ -209,7 +229,8 @@ public: static constexpr QuantMode fromDescription(bool quantizeWeights, bool quantizeActivations, bool perToken, bool perChannel, bool perGroup, bool useInt4Weights, bool useInt8KvCache, bool useFp8KvCache, bool useFp8Qdq, - bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8) + bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8, + bool useW4a8Mxfp4Mxfp8, bool useW4a16Mxfp4) { QuantMode quantMode{}; if (quantizeWeights) @@ -278,25 +299,35 @@ public: quantMode += w4a8Mxfp4Fp8(); } + if (useW4a8Mxfp4Mxfp8) + { + quantMode += w4a8Mxfp4Mxfp8(); + } + + if (useW4a16Mxfp4) + { + quantMode += w4a16Mxfp4(); + } + return quantMode; } static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false) { - return fromDescription( - true, true, perToken, perChannel, false, false, false, false, false, false, false, false, false, false); + return fromDescription(true, true, perToken, perChannel, false, false, false, false, false, false, false, false, + false, false, false, false); } static constexpr QuantMode useQServe(bool perGroup) { - return fromDescription( - true, true, false, false, perGroup, true, false, false, false, false, true, false, false, false); + return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true, false, false, + false, false, false); } static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false) { return fromDescription(true, false, false, false, perGroup, useInt4Weights, false, false, false, false, false, - false, false, false); + false, false, false, false, false); } static QuantMode const fromQuantAlgo( @@ -353,28 +384,38 @@ public: } else if (quantAlgo == "FP8") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, true, false, false, false, false, false); + quantMode = fromDescription(false, false, false, false, false, false, false, false, true, false, false, + false, false, false, false, false); } else if (quantAlgo == "FP8_ROWWISE") { - quantMode = fromDescription( - false, false, true, true, false, false, false, false, false, true, false, false, false, false); + quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true, false, false, + false, false, false, false); } else if (quantAlgo == "FP4") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, false, false, false, true, false, false); + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + true, false, false, false, false); } else if (quantAlgo == "FP8_BLOCK_SCALES") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, false, false, false, 
false, true, false); + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, true, false, false, false); } else if (quantAlgo == "W4A8_MXFP4_FP8") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, false, false, false, false, false, true); + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, false, true, false, false); + } + else if (quantAlgo == "W4A8_MXFP4_MXFP8") + { + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, false, false, true, false); + } + else if (quantAlgo == "W4A16_MXFP4") + { + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, false, false, false, true); } if (kvCacheQuantAlgo == "INT8") diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py index f9f28978e6..bd255bdd85 100644 --- a/cpp/kernels/fmha_v2/fmha_test.py +++ b/cpp/kernels/fmha_v2/fmha_test.py @@ -50,7 +50,7 @@ def getSMVersion(): ids=["fp16", "bf16", "fp16-fp32", "e4m3"]) @pytest.mark.parametrize('flag', [ "-s-q 128 -paged-kv", "-s-q 63 -paged-kv", "-paged-kv", - "-softcapping-scale-bmm1 30", "-contiguous-q-kv" + "-softcapping-scale-bmm1 30", "-contiguous-q-kv", "-use-attention-sinks" ]) @pytest.mark.parametrize('tiled_kernel', ["", "-force-non-tiled"]) def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel): @@ -117,8 +117,8 @@ def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel): f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -custom-mask -gqa 2 -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}", shell=True, check=True) - # alibi and softcapping-scale-bmm1 are mutually exclusive. - if '-softcapping-scale-bmm1' not in flag: + # alibi doesn't work with softcapping-scale-bmm1/use-attention-sinks. + if '-softcapping-scale-bmm1' not in flag and '-use-attention-sinks' not in flag: subprocess.run( f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -causal-mask -alibi -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}", shell=True, diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h index 65e56dbf5d..eed6f852da 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h @@ -326,9 +326,6 @@ struct Compute uint32_t smem_v = __cvta_generic_to_shared(&shared->smem_v[0]); Compute_tile_o ctile_o(0, smem_v); - // BMM2 epilogue - Tile_o_epilogue tile_o_epilogue(params); - // Mutex between two compute groups. OrderedMutexAccessor mutex_accessor(shared->compute_mutex, warpgroup_id, SYNC_BARRIER); // Notify warpgroup 0 to execute HGMMA first (overlap HGMMA and Softmax Math Instructions). @@ -368,6 +365,9 @@ struct Compute sage_scale_row = head_info.bidb * params.h + head_info.bidh; } + // BMM2 epilogue + Tile_o_epilogue tile_o_epilogue(params, head_info); + int q_step_idx = warpgroup_id; // Compute work. @@ -490,7 +490,7 @@ struct Compute if (valid_run) { // Final step's update. - tile_o_epilogue.scale(ctile_o, p_sum); + tile_o_epilogue.scale(ctile_o, p_max, p_sum); // Store o_tile to gmem. 
gmem_o.store(ctile_o.acc_); } diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h index 217e8c0872..99ea1643cd 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h @@ -454,7 +454,7 @@ struct Softmax_base #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { - uint32_t const scale = float_to_half2(correction_[mi]); + const uint32_t scale = float_to_half2(correction_[mi]); // Assume only N has multiple MMAs (MMAS_M = 1). // MMAS_N > 1 when N dimension is split. @@ -477,9 +477,15 @@ struct Softmax_base } // BMM1 scale. - uint32_t const scale_bmm1_; + const uint32_t scale_bmm1_; // BMM1 softcapping scale. float const softcapping_scale_bmm1_; + + // The sliding window size. + int const sliding_window_size_; + // The log2 attention chunk size. + int const log2_chunked_attention_size_; + // The thread idx in the warp group. int tidx_; // The col index for the mma thread layout. @@ -487,15 +493,10 @@ struct Softmax_base // The row index for the mma thread layout. int quad_row_; - // The sliding window size. - int const sliding_window_size_; - // The log2 attention chunk size. - int const log2_chunked_attention_size_; - // The packed mask ptr. uint32_t const* packed_mask_ptr_; // The packed mask k-dim stride in bytes; - int64_t const params_packed_mask_stride_in_bytes_; + const int64_t params_packed_mask_stride_in_bytes_; // Unpacked BMM1 output buffer. float elt_[Mma_tile_p::CORES_M][Mma_tile_p::CORES_N * 2]; @@ -1072,20 +1073,53 @@ struct Tile_o_epilogue_base // The MMA tile for the BMM2. using Mma_tile_o = typename Kernel_traits::Mma_tile_o; - template - inline __device__ Tile_o_epilogue_base(Params const& params) + // Apply the exp2f optimization (fuse bmm1_scale and -max into FMAs). + enum { - ; // nothing to construct. + EXP2F_OPTIMIZATION = Kernel_traits::EXP2F_OPTIMIZATION }; + template + inline __device__ Tile_o_epilogue_base(Params const& params, Block_info& block_info) + { + has_attention_sink_ = params.attention_sinks != nullptr; + head_idx_ = block_info.bidh; + attention_sink_ = has_attention_sink_ ? params.attention_sinks[block_info.bidh] : 0.f; + // It is only need when the exp2f optimization is enabled, so params.scale_bmm1 is always float. + scale_bmm1_f_ = reinterpret_cast(params.scale_bmm1_d ? *params.scale_bmm1_d : params.scale_bmm1); + }; + + // The attention sinks. + inline __device__ void add_attention_sink(float& sum, float max) + { + if (has_attention_sink_) + { + // The global max needs to be scaled by the bmm1 scale if exp2f optimization is enabled. + if constexpr (EXP2F_OPTIMIZATION) + { + sum += exp2f(attention_sink_ * M_LOG2E - max * scale_bmm1_f_); + } + else + { + sum += expf(attention_sink_ - max); + } + } + } + // Scale ctile_o output by 1/sum - inline __device__ void scale(Compute_tile_o& ctile_o, float (&global_sum)[Mma_tile_o::CORES_M]) + inline __device__ void scale( + Compute_tile_o& ctile_o, float (&global_max)[Mma_tile_o::CORES_M], float (&global_sum)[Mma_tile_o::CORES_M]) { // Final step's update. #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { - global_sum[mi] = global_sum[mi] == 0.f ? 1.f : 1.0f / global_sum[mi]; + // The global sum. + float global_sum_mi = global_sum[mi]; + // Add the attention sink to the global sum. + add_attention_sink(global_sum_mi, global_max[mi]); + // The scale. + float scale = global_sum_mi == 0.f ? 1.f : 1.0f / global_sum_mi; // Assume only N has multiple MMAs (MMAS_M = 1). 
#pragma unroll @@ -1096,12 +1130,21 @@ struct Tile_o_epilogue_base { float& reg0 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi); float& reg1 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi + 1); - reg0 *= global_sum[mi]; - reg1 *= global_sum[mi]; + reg0 *= scale; + reg1 *= scale; } } } } + + // Whether the attention sink is enabled. + bool has_attention_sink_ = false; + // The attention sink value. + float attention_sink_ = 0.f; + // The float scale of bmm1 outputs. + float scale_bmm1_f_ = 1.f; + // The head idx. + int head_idx_ = 0; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1138,14 +1181,21 @@ struct Tile_o_epilogue using Base::Tile_o_epilogue_base; // Scale ctile_o output by 1/sum - inline __device__ void scale(Compute_tile_o& ctile_o, float (&global_sum)[Mma_tile_o::CORES_M]) + inline __device__ void scale( + Compute_tile_o& ctile_o, float (&global_max)[Mma_tile_o::CORES_M], float (&global_sum)[Mma_tile_o::CORES_M]) { // Final step's update. #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { - global_sum[mi] = global_sum[mi] == 0.f ? 1.f : 1.0f / global_sum[mi]; - uint32_t const scale = float_to_half2(global_sum[mi]); + // The global sum. + float global_sum_mi = global_sum[mi]; + // Add the attention sink to the global sum. + this->add_attention_sink(global_sum_mi, global_max[mi]); + // The scale. + float scale = global_sum_mi == 0.f ? 1.f : 1.0f / global_sum_mi; + // The scale. + const uint32_t scale_h = float_to_half2(scale); // Assume only N has multiple MMAs (MMAS_M = 1). #pragma unroll @@ -1155,7 +1205,7 @@ struct Tile_o_epilogue for (int ni = 0; ni < Mma_tile_o::CORES_N; ni++) { uint32_t& reg = ctile_o.acc_[0][mma_ni].reg(ni * Mma_tile_o::CORES_M + mi); - reg = hmul2(reg, scale); + reg = hmul2(reg, scale_h); } } } @@ -1215,27 +1265,58 @@ struct Tile_o_epilogue // The MMA tile for the BMM2. using Mma_tile_o = typename Base::Mma_tile_o; + // Apply the exp2f optimization (fuse bmm1_scale and -max into FMAs). + enum + { + EXP2F_OPTIMIZATION = Base::EXP2F_OPTIMIZATION + }; + // Ctor. - template - inline __device__ Tile_o_epilogue(Params const& params) - : Base(params) + template + inline __device__ Tile_o_epilogue(Params const& params, Block_info& block_info) + : Base(params, block_info) , scale_bmm2_(*params.scale_bmm2_d) { } + // Add the attention sink to the global sum. + inline __device__ void add_attention_sink(float& sum, float max) + { + if (this->has_attention_sink_) + { + // The global max needs to be scaled by the bmm1 scale if exp2f optimization is enabled. + // Take the log2f(Traits_o::SOFTMAX_FP_QUANT_SCALE) into account as the same scale has been applied to sum. + float quant_scale_in_log2 = log2f(Traits_o::SOFTMAX_FP_QUANT_SCALE); + if constexpr (EXP2F_OPTIMIZATION) + { + sum += exp2f(this->attention_sink_ * M_LOG2E - max * this->scale_bmm1_f_ + quant_scale_in_log2); + } + else + { + sum += expf(this->attention_sink_ - max + quant_scale_in_log2); + } + } + } + // Scale ctile_o output by 1/sum - inline __device__ void scale(Compute_tile_o& ctile_o, float (&global_sum)[Mma_tile_o::CORES_M]) + inline __device__ void scale( + Compute_tile_o& ctile_o, float (&global_max)[Mma_tile_o::CORES_M], float (&global_sum)[Mma_tile_o::CORES_M]) { // Final step's update. #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { + // The global sum. + float global_sum_mi = global_sum[mi]; + // Add the attention sink to the global sum. 
+ add_attention_sink(global_sum_mi, global_max[mi]); #ifdef UNIFIED_EPILOGUE_SCALE // Descaling factor float const scale_bmm2_f_ = reinterpret_cast(scale_bmm2_); - global_sum[mi] = global_sum[mi] == 0.f ? scale_bmm2_f_ : scale_bmm2_f_ / global_sum[mi]; + // The scale. + float scale = global_sum_mi == 0.f ? scale_bmm2_f_ : scale_bmm2_f_ / global_sum_mi; #else - global_sum[mi] = global_sum[mi] == 0.f ? 1.0f : 1.0f / global_sum[mi]; + float scale = global_sum_mi == 0.f ? 1.0f : 1.0f / global_sum_mi; #endif // Assume only N has multiple MMAs (MMAS_M = 1). #pragma unroll @@ -1246,8 +1327,8 @@ struct Tile_o_epilogue { float& reg0 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi); float& reg1 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi + 1); - reg0 *= global_sum[mi]; - reg1 *= global_sum[mi]; + reg0 *= scale; + reg1 *= scale; } } } diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp index 6d9811ac07..6cf52fcf4c 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp @@ -29,33 +29,36 @@ using Kv_block_array = fmha::Kv_block_array; //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp32(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -void run_softmax_e4m3(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, int warps_n, +void run_softmax_fp32(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_e4m3(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, + int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_bf16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -void run_softmax_int8(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, float softcapping_scale_bmm1, int warps_n, +void run_softmax_fp16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, 
int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// +void run_softmax_bf16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void run_softmax_int8(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, + float softcapping_scale_bmm1, int warps_n, bool has_alibi); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + void run_conversion_int32_to_int8(void* dst, void const* src, int s, int b, int h, int d, float scale); //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -81,11 +84,11 @@ void run_sage_quant(unsigned int batch_size, unsigned int head_num, unsigned int //////////////////////////////////////////////////////////////////////////////////////////////////// -void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_type const acc_type, +void ground_truth(RefBMM& bmm1, RefBMM& bmm2, const Data_type data_type, const Data_type acc_type, float const scale_bmm1, float const scale_softmax, float const scale_bmm2, float const softcapping_scale_bmm1, - void* qkv_d, void* vt_d, void* mask_d, void* p_d, void* s_d, void* tmp_d, void* o_d, void* softmax_sum_d, - void* cu_q_seqlens_d, size_t const b, size_t const s, size_t const h, size_t const d, size_t const dv, - int const runs, int const warps_m, int const warps_n, bool const has_alibi) + void* qkv_d, void* vt_d, void* mask_d, void* attention_sinks_d, void* p_d, void* s_d, void* tmp_d, void* o_d, + void* softmax_sum_d, void* cu_q_seqlens_d, const size_t b, const size_t s, const size_t h, const size_t d, + const size_t dv, int const runs, int const warps_m, int const warps_n, bool const has_alibi) { cudaStream_t stream = 0; @@ -106,28 +109,28 @@ void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_ty // Softmax. 
if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP16) { - run_softmax_fp16(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, softcapping_scale_bmm1, - warps_n, has_alibi); + run_softmax_fp16(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_BF16 && acc_type == DATA_TYPE_FP32) { - run_softmax_bf16(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, softcapping_scale_bmm1, - warps_n, has_alibi); + run_softmax_bf16(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP32) { - run_softmax_fp32(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, softcapping_scale_bmm1, - warps_n, has_alibi); + run_softmax_fp32(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_E4M3 && acc_type == DATA_TYPE_FP32) { - run_softmax_e4m3(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, scale_softmax, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax_e4m3(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_INT8 && acc_type == DATA_TYPE_INT32) { - run_softmax_int8(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, scale_bmm1, scale_softmax, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax_int8(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, scale_bmm1, + scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } else { @@ -179,7 +182,7 @@ static inline void set_params(bert::Fused_multihead_attention_params_v1& params, // types Data_type data_type, Data_type acc_type, // sizes - size_t const b, size_t const s, size_t const h, size_t const d, size_t const packed_mask_stride, + const size_t b, const size_t s, const size_t h, const size_t d, const size_t packed_mask_stride, // device pointers void* qkv_d, void* packed_mask_d, void* o_d, void* p_d, void* s_d, // scale factors @@ -235,17 +238,17 @@ static inline void set_params(bert::Fused_multihead_attention_params_v1& params, //////////////////////////////////////////////////////////////////////////////////////////////////// -static inline void set_params(bert::Fused_multihead_attention_params_v2& params, Launch_params const launch_params, +static inline void set_params(bert::Fused_multihead_attention_params_v2& params, const Launch_params launch_params, // types Data_type data_type, Data_type acc_type, Data_type output_dtype, // attention input layout Attention_input_layout input_layout, // sizes - size_t const b, size_t const s_q, size_t const s_kv, size_t const h, size_t const h_kv, size_t const d, - size_t const dv, size_t const total, const size_t num_grouped_heads, const size_t sliding_window_size, + const size_t b, const size_t s_q, const size_t s_kv, const size_t h, const size_t h_kv, const size_t d, + const size_t dv, const size_t total, const size_t num_grouped_heads, const size_t sliding_window_size, const size_t chunked_attention_size, // paged kv cache block size. - size_t const tokens_per_block, + const size_t tokens_per_block, // device pointers void* qkv_packed_d, // contiguous q. 
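Illustrative sketch (not part of the patch): the run_softmax_* reference helpers above now take an attention_sinks pointer, and the warp-specialized epilogue adds an expf(attention_sink_ - max) term to the softmax denominator, so the per-head sink contributes probability mass to the normalizer without producing an output column. A minimal CPU version of one row, with hypothetical names, mirroring the add_attention_sink logic above (the sink is excluded from the row max):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax over one row of attention scores with a per-head sink value.
std::vector<float> softmax_row_with_sink(std::vector<float> const& scores, float sink)
{
    // Row max over the real scores only; the sink does not participate in the max.
    float max_val = scores.empty() ? 0.f : *std::max_element(scores.begin(), scores.end());
    // Denominator starts with the sink term exp(sink - max), then accumulates exp(s - max).
    float denom = std::exp(sink - max_val);
    std::vector<float> probs(scores.size());
    for (std::size_t i = 0; i < scores.size(); ++i)
    {
        probs[i] = std::exp(scores[i] - max_val);
        denom += probs[i];
    }
    // Normalize; the sink mass is dropped, so the row sums to slightly less than 1.
    for (float& p : probs)
    {
        p /= denom;
    }
    return probs;
}
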
@@ -261,8 +264,10 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, // offsets for different blocks in terms of the start address. int32_t* paged_block_offsets, // mask input. - void* packed_mask_d, void* cu_mask_rows_d, void* cu_kv_seqlens_d, void* cu_q_seqlens_d, void* o_packed_d, void* p_d, - void* s_d, void* softmax_stats_d, void* scale_bmm2_d, + void* packed_mask_d, void* cu_mask_rows_d, + // attention sinks. + void* attention_sinks_d, void* cu_kv_seqlens_d, void* cu_q_seqlens_d, void* o_packed_d, void* p_d, void* s_d, + void* softmax_stats_d, void* scale_bmm2_d, // scale factors float const scale_bmm1, float const scale_softmax, float const scale_bmm2, float const softcapping_scale_bmm1, // flags @@ -329,6 +334,9 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, // The N dimension has to be aligned. params.packed_mask_stride_in_bytes = (align_to(int64_t(s_kv), int64_t(fmha::FLASH_ATTEN_MASK_N_ALIGNMENT))) / 8; + // Attention sinks. + params.attention_sinks = reinterpret_cast(attention_sinks_d); + #if defined(STORE_P) params.p_ptr = p_d; params.p_stride_in_bytes = get_size_in_bytes(b * h * s_kv, acc_type); @@ -412,13 +420,13 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, //////////////////////////////////////////////////////////////////////////////////////////////////// -static inline void determine_launch_params(Launch_params& launch_params, Data_type data_type, int sm, size_t const s, - size_t const d, Attention_mask_type const attention_mask_type, Attention_input_layout const input_layout, +static inline void determine_launch_params(Launch_params& launch_params, Data_type data_type, int sm, const size_t s, + const size_t d, const Attention_mask_type attention_mask_type, const Attention_input_layout input_layout, bool const interleaved, bool const ignore_b1opt, bool const force_unroll, bool const use_tma, bool const force_non_flash_attention, bool const force_non_warp_specialization, bool const force_non_granular_tiling, bool const force_fp32_acc, // device props - cudaDeviceProp const props) + const cudaDeviceProp props) { // Set launch params to choose kernels @@ -573,6 +581,9 @@ int main(int argc, char** argv) // SageAttention block sizes int sage_block_size_q = 0, sage_block_size_k = 0, sage_block_size_v = 0; + // Use attention sinks (added to the denominator of softmax) + bool use_attention_sinks = false; + // Read the parameters from the command-line. for (int ii = 1; ii < argc; ++ii) { @@ -865,13 +876,16 @@ int main(int argc, char** argv) { sage_block_size_v = strtol(argv[ii], nullptr, 10); } + else if (!strcmp(argv[ii], "-use-attention-sinks")) + { + use_attention_sinks = true; + } else { fprintf(stderr, "Unrecognized option: %s. Aborting!\n", argv[ii]); return -1; } } - if (save_softmax == true) { if (input_layout != Attention_input_layout::CONTIGUOUS_Q_KV) @@ -1043,11 +1057,11 @@ int main(int argc, char** argv) force_non_granular_tiling, force_fp32_acc, props); // The Q, K and V matrices are packed into one big matrix of size S x B x H x 3 x D. - size_t const qkv_size = s * b * h * (2 * d + dv); + const size_t qkv_size = s * b * h * (2 * d + dv); // Allocate on the host. float* qkv_h = (float*) malloc(qkv_size * sizeof(float)); // The size in bytes. - size_t const qkv_size_in_bytes = get_size_in_bytes(qkv_size, data_type); + const size_t qkv_size_in_bytes = get_size_in_bytes(qkv_size, data_type); // Allocate on the device. 
void *qkv_sbh3d_d = nullptr, *qkv_bsh3d_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&qkv_sbh3d_d, qkv_size_in_bytes)); @@ -1057,7 +1071,7 @@ int main(int argc, char** argv) // The shape is [B, 2, S, H, D]. const size_t kv_size = b * s * h_kv * (d + dv); // The size in bytes. - size_t const kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); + const size_t kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); // Allocate on the host. void* contiguous_kv_h = malloc(kv_size_in_bytes); // Memset the buffer. @@ -1071,13 +1085,13 @@ int main(int argc, char** argv) void** kv_cache_ptrs_h = nullptr; void* kv_cache_pool_ptr = nullptr; int32_t *kv_cache_block_offsets_h, *kv_cache_block_offsets_d = nullptr; - size_t const max_blocks_per_seq = (s + tokens_per_block - 1) / tokens_per_block; - size_t const num_total_blocks = b * 2 * max_blocks_per_seq; + const size_t max_blocks_per_seq = (s + tokens_per_block - 1) / tokens_per_block; + const size_t num_total_blocks = b * 2 * max_blocks_per_seq; kv_cache_ptrs_h = (void**) malloc(num_total_blocks * sizeof(void*)); kv_cache_block_offsets_h = (int32_t*) malloc(num_total_blocks * sizeof(int32_t)); - size_t const paged_kv_block_size_in_bytes = get_size_in_bytes(tokens_per_block * h_kv * std::gcd(d, dv), data_type); + const size_t paged_kv_block_size_in_bytes = get_size_in_bytes(tokens_per_block * h_kv * std::gcd(d, dv), data_type); FMHA_CHECK_CUDA(cudaMalloc((void**) (&kv_cache_block_offsets_d), num_total_blocks * sizeof(int32_t))); - size_t const kv_cache_pool_sz + const size_t kv_cache_pool_sz = get_size_in_bytes(num_total_blocks * tokens_per_block * h_kv * (d + dv) / 2, data_type); FMHA_CHECK_CUDA(cudaMalloc((void**) (&kv_cache_pool_ptr), kv_cache_pool_sz)); size_t ptr_index = 0; @@ -1104,7 +1118,7 @@ int main(int argc, char** argv) // Q will always be [B, S, H, Dh] with paged kv cache. void* q_d; - size_t const q_size = s * b * h * d; + const size_t q_size = s * b * h * d; FMHA_CHECK_CUDA(cudaMalloc(&q_d, get_size_in_bytes(q_size, data_type))); // K has [B, S, H_kv, D] with separate kv cache. @@ -1122,11 +1136,11 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMalloc(&scale_bmm2_d, sizeof(uint32_t))); // The mask for dropout or any mask patterns. - size_t const mask_size = s * b * s; + const size_t mask_size = s * b * s; // Allocate on the host. float* mask_h = (float*) malloc(mask_size * sizeof(float)); // The size in bytes. - size_t const mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); + const size_t mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); // Allocate on the device. void* mask_d = nullptr; if (!skip_checks) @@ -1158,7 +1172,7 @@ int main(int argc, char** argv) v1 ? 1 : 2); // The number of threads per CTA. - size_t const threads_per_cta = warps_m * warps_n * warps_k * 32; + const size_t threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of mmas in the M dimension. We use one uint32_t per MMA in the M dimension. size_t mmas_m = (s + 16 * warps_m - 1) / (16 * warps_m); // The number of mmas in the N dimension. @@ -1182,7 +1196,7 @@ int main(int argc, char** argv) packed_mask_size = b * mmas_m * mmas_n * threads_per_cta; } // The size in bytes. - size_t const packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); + const size_t packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); // Allocate on the host. uint32_t* packed_mask_h = (uint32_t*) malloc(packed_mask_size_in_bytes); // Set it to 0 (indicates that all elements are valid). 
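Usage note (illustrative, not part of the patch): with the new -use-attention-sinks option, the standalone harness (below) allocates h float sink values, randomly initializes them, and passes the device buffer through set_params and ground_truth, so the flag composes with the existing fmha.exe switches exercised by fmha_test.py. For example (head size and sequence length chosen arbitrarily; dtype and verbosity switches omitted):

    bin/fmha.exe -d 64 -h 16 -b 8 -s 256 -min-s 128 -custom-mask -gqa 2 -use-attention-sinks

As the updated fmha_test.py comment notes, -alibi is not combined with -use-attention-sinks.
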
@@ -1190,12 +1204,30 @@ int main(int argc, char** argv) // Allocate on the device. void* packed_mask_d = nullptr; + // The size of the attention sinks. + const size_t attention_sinks_size_in_bytes = h * sizeof(float); + + // The attention sinks. + void* attention_sinks_d = nullptr; + if (use_attention_sinks) + { + // Allocate on the host. + float* attention_sinks_h = (float*) malloc(attention_sinks_size_in_bytes); + // Randomly initialize the attention sinks. + random_init("attention_sinks", attention_sinks_h, 1, h, 1, false, 5.f, 1.f, verbose); + // Allocate on the device. + FMHA_CHECK_CUDA(cudaMalloc(&attention_sinks_d, attention_sinks_size_in_bytes)); + // Copy from the host to the device. + FMHA_CHECK_CUDA( + cudaMemcpy(attention_sinks_d, attention_sinks_h, attention_sinks_size_in_bytes, cudaMemcpyDefault)); + } + // The O matrix is packed as S * B * H * D. - size_t const o_size = s * b * h * dv; + const size_t o_size = s * b * h * dv; // Allocate on the host. float* o_h = (float*) malloc(o_size * sizeof(float)); // The size in bytes. - size_t const o_size_in_bytes = get_size_in_bytes(o_size, data_type); + const size_t o_size_in_bytes = get_size_in_bytes(o_size, data_type); // Allocate on the device. void* o_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&o_d, o_size_in_bytes)); @@ -1206,7 +1238,7 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMemset(softmax_stats_d, 0x00, 2 * sizeof(float) * b * s * h)); // The size in bytes. - size_t const tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); + const size_t tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); // Allocate on the device. void* tmp_d = nullptr; if (data_type != acc_type) @@ -1220,9 +1252,9 @@ int main(int argc, char** argv) float* softmax_sum_h = (float*) malloc(b * s * h * sizeof(float)); // The P matrix is stored as one big matrix of size S x B x H x S. - size_t const p_size = s * b * h * s; + const size_t p_size = s * b * h * s; // The size in bytes. - size_t const p_size_in_bytes = get_size_in_bytes(p_size, acc_type); + const size_t p_size_in_bytes = get_size_in_bytes(p_size, acc_type); // Allocate on the device. void* p_d = nullptr; if (!skip_checks) @@ -1238,7 +1270,7 @@ int main(int argc, char** argv) #endif // defined(STORE_P) // The size in bytes of the S matrix (the data type may be different from P for int8). - size_t const s_size_in_bytes = get_size_in_bytes(p_size, data_type); + const size_t s_size_in_bytes = get_size_in_bytes(p_size, data_type); // Allocate on the device. void* s_d = nullptr; if (!skip_checks) @@ -1327,7 +1359,7 @@ int main(int argc, char** argv) std::vector seqlens(b, 0); // randomly draw a batch of sequence lengths >= min_s std::transform(seqlens.begin(), seqlens.end(), seqlens.begin(), - [=](uint32_t const) + [=](const uint32_t) { if (fix_s) { @@ -1415,7 +1447,7 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMalloc(&mqa_qkv_packed_d, mqa_qkv_packed_size_in_bytes)); FMHA_CHECK_CUDA(cudaMalloc(&mqa_qkv_d, mqa_qkv_size_in_bytes)); - size_t const o_packed_size = cu_seqlens.back() * h * dv; + const size_t o_packed_size = cu_seqlens.back() * h * dv; // Allocate on the host. float* o_packed_h = (float*) malloc(o_packed_size * sizeof(float)); void* o_packed_d = nullptr; @@ -1676,9 +1708,9 @@ int main(int argc, char** argv) total, num_grouped_heads, sliding_window_size, chunked_attention_size, // Paged kv cache. 
tokens_per_block, qkv_d_view, q_d, k_d, v_d, contiguous_kv_d, kv_cache_pool_ptr, kv_cache_block_offsets_d, - packed_mask_d, cu_mask_rows_d, cu_seqlens_d, cu_q_seqlens_d, o_d_view, p_d, s_d, softmax_stats_ptr, - scale_bmm2_d, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, use_int8_scale_max, interleaved, - is_s_padded, has_alibi); + packed_mask_d, cu_mask_rows_d, attention_sinks_d, cu_seqlens_d, cu_q_seqlens_d, o_d_view, p_d, s_d, + softmax_stats_ptr, scale_bmm2_d, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, + use_int8_scale_max, interleaved, is_s_padded, has_alibi); // total number of tokens is needed to set TMA desc on the host. launch_params.total_q_seqlen = q_seqlens[b]; @@ -1894,8 +1926,8 @@ int main(int argc, char** argv) ground_truth(bmm1, bmm2, data_type, acc_type, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, qkv_sbh3d_d, vt_d, // WAR pass in V' - mask_d, p_d, s_d, tmp_d, o_d, softmax_stats_d, cu_seqlens_d, b, s, h, d, dv, runs, warps_m, warps_n, - has_alibi); + mask_d, attention_sinks_d, p_d, s_d, tmp_d, o_d, softmax_stats_d, cu_seqlens_d, b, s, h, d, dv, runs, + warps_m, warps_n, has_alibi); timer.stop(); FMHA_CHECK_CUDA(cudaPeekAtLastError()); FMHA_CHECK_CUDA(cudaDeviceSynchronize()); @@ -2009,7 +2041,6 @@ int main(int argc, char** argv) // Extract the last s_q tokens from the output. extract_and_transpose_output( o_ref_trans_h.data(), o_ref_h, seqlens, q_seqlens, s, s_q, b, h, dv, is_s_padded); - if (verbose) { printf("\nChecking .....: O = V * S\n"); diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h index f77e3f14d0..16e2f9a8db 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h @@ -197,6 +197,9 @@ struct Fused_multihead_attention_params_v2 : Fused_multihead_attention_params_ba // The stride between rows of softmax_stats_ptr int64_t softmax_stats_stride_in_bytes; + // The attention sinks (per head). + float* attention_sinks; + // array of length b+1 holding prefix sum of actual q sequence lengths. int* cu_q_seqlens; // array of length b+1 holding prefix sum of actual kv sequence lengths. diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h index 76670971e5..bacb4938cf 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h @@ -87,6 +87,8 @@ struct Fused_multihead_attention_params_v2 fmha::Kv_block_array paged_kv_cache; // The mask to implement drop-out. void* packed_mask_ptr; + // The attention sinks (per head). + float* attention_sinks; // The O matrix (output). 
void* o_ptr; // The Softmax stats vector of layout [2, B, S, H], including softmax_sum and softmax_max diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp index 8a2e7a8fc0..6e37fc6ab4 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp @@ -23,28 +23,30 @@ using Launch_params = bert::Fused_multihead_attention_launch_params; //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp32(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -void run_softmax_e4m3(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, int warps_n, +void run_softmax_fp32(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_e4m3(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, + int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_int8(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, float softcapping_scale_bmm1, int warps_n, +void run_softmax_fp16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// +void run_softmax_int8(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, + float softcapping_scale_bmm1, int warps_n, bool has_alibi); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + void run_conversion_int32_to_int8(void* dst, void const* src, int s, int b, int h, int d, float scale); //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,10 +59,10 @@ void run_conversion_fp32_to_e4m3(void* dst, void const* src, int s, int b, int h //////////////////////////////////////////////////////////////////////////////////////////////////// -void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_type const acc_type, +void ground_truth(RefBMM& 
bmm1, RefBMM& bmm2, const Data_type data_type, const Data_type acc_type, float const scale_bmm1, float const scale_softmax, float const scale_bmm2, void* q_d, void* kv_d, void* vt_d, void* mask_d, void* p_d, void* s_d, void* tmp_d, void* o_d, void* softmax_sum_d, void* cu_seqlens_q_d, - size_t const b, size_t const s_q, size_t const s_kv, size_t const h, size_t const d, int const runs, + const size_t b, const size_t s_q, const size_t s_kv, const size_t h, const size_t d, int const runs, int const warps_m, int const warps_n, bool has_alibi) { @@ -84,20 +86,22 @@ void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_ty // Softmax. if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP16) { - run_softmax_fp16(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); + run_softmax_fp16( + s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); } else if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP32) { - run_softmax_fp32(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); + run_softmax_fp32( + s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); } else if (data_type == DATA_TYPE_E4M3 && acc_type == DATA_TYPE_FP32) { - run_softmax_e4m3(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_softmax, 0.f, - warps_n, has_alibi); + run_softmax_e4m3(s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_softmax, + 0.f, warps_n, has_alibi); } else if (data_type == DATA_TYPE_INT8 && acc_type == DATA_TYPE_INT32) { - run_softmax_int8(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_bmm1, + run_softmax_int8(s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_bmm1, scale_softmax, 0.f, warps_n, has_alibi); } else @@ -148,8 +152,8 @@ static inline void set_params(bert::Fused_multihead_attention_params_mhca& param // types Data_type data_type, Data_type acc_type, // sizes - size_t const b, size_t const s_q, size_t const s_kv, size_t const h, size_t const d, size_t const d_padded, - size_t const total, + const size_t b, const size_t s_q, const size_t s_kv, const size_t h, const size_t d, const size_t d_padded, + const size_t total, // device pointers void* q_packed_d, void* kv_packed_d, void* cu_seqlens_q_d, void* cu_seqlens_kv_d, void* o_packed_d, void* p_d, void* s_d, @@ -515,17 +519,17 @@ int main(int argc, char** argv) launch_params.use_tma = use_tma; // The Q matrix of size S_Q x B x H x D. - size_t const q_size = s_q * b * h * d; + const size_t q_size = s_q * b * h * d; // The K and V matrices are packed into one big matrix of size S_KV x B x H x 2 x D. - size_t const kv_size = s_kv_padded * b * h * 2 * d; + const size_t kv_size = s_kv_padded * b * h * 2 * d; // Allocate on the host. float* q_h = (float*) malloc(q_size * sizeof(float)); // Allocate on the host. float* kv_h = (float*) malloc(kv_size * sizeof(float)); // The size in bytes. - size_t const q_size_in_bytes = get_size_in_bytes(q_size, data_type); + const size_t q_size_in_bytes = get_size_in_bytes(q_size, data_type); // The size in bytes. - size_t const kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); + const size_t kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); // Allocate on the device. 
void* q_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&q_d, q_size_in_bytes)); @@ -534,11 +538,11 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMalloc(&kv_d, kv_size_in_bytes)); // The mask for dropout. - size_t const mask_size = s_q * b * s_kv_padded; + const size_t mask_size = s_q * b * s_kv_padded; // Allocate on the host. float* mask_h = (float*) malloc(mask_size * sizeof(float)); // The size in bytes. - size_t const mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); + const size_t mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); // Allocate on the device. void* mask_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&mask_d, mask_size_in_bytes)); @@ -554,28 +558,28 @@ int main(int argc, char** argv) v1 ? 1 : 2); // The number of threads per CTA. - size_t const threads_per_cta = warps_m * warps_n * warps_k * 32; + const size_t threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of mmas in the M dimension. We use one uint32_t per MMA in the M dimension. - size_t const mmas_m = (s_q + 16 * warps_m - 1) / (16 * warps_m); + const size_t mmas_m = (s_q + 16 * warps_m - 1) / (16 * warps_m); // The number of mmas in the N dimension. - size_t const mmas_n = (s_kv_padded + 16 * warps_n - 1) / (16 * warps_n); + const size_t mmas_n = (s_kv_padded + 16 * warps_n - 1) / (16 * warps_n); // We do not support more than 4 MMAS in the N dimension (as each MMA needs 8 bits in the mask). assert(!v1 || mmas_n <= 4); // The packed mask for dropout (in the fused kernel). Layout is B * MMAS_M * THREADS_PER_CTA. - size_t const packed_mask_size = b * mmas_m * threads_per_cta; + const size_t packed_mask_size = b * mmas_m * threads_per_cta; // The size in bytes. - size_t const packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); + const size_t packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); // Allocate on the host. uint32_t* packed_mask_h = (uint32_t*) malloc(packed_mask_size_in_bytes); // Allocate on the device. void* packed_mask_d = nullptr; // The O matrix is packed as S_Q * B * H * D. - size_t const o_size = s_q * b * h * d; + const size_t o_size = s_q * b * h * d; // Allocate on the host. float* o_h = (float*) malloc(o_size * sizeof(float)); // The size in bytes. - size_t const o_size_in_bytes = get_size_in_bytes(o_size, data_type); + const size_t o_size_in_bytes = get_size_in_bytes(o_size, data_type); // Allocate on the device. void* o_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&o_d, o_size_in_bytes)); @@ -587,7 +591,7 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMemset(softmax_max_d, 0x00, sizeof(float) * b * s_q * h)); // The size in bytes. - size_t const tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); + const size_t tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); // Allocate on the device. void* tmp_d = nullptr; if (data_type != acc_type) @@ -599,9 +603,9 @@ int main(int argc, char** argv) float* o_ref_h = (float*) malloc(o_size * sizeof(float)); // The P matrix is stored as one big matrix of size S_Q x B x H x S_KV. - size_t const p_size = s_q * b * h * s_kv_padded; + const size_t p_size = s_q * b * h * s_kv_padded; // The size in bytes. - size_t const p_size_in_bytes = get_size_in_bytes(p_size, acc_type); + const size_t p_size_in_bytes = get_size_in_bytes(p_size, acc_type); // Allocate on the device. 
void* p_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&p_d, p_size_in_bytes)); @@ -614,7 +618,7 @@ int main(int argc, char** argv) #endif // defined(STORE_P) // The size in bytes of the S matrix (the data type may be different from P for int8). - size_t const s_size_in_bytes = get_size_in_bytes(p_size, data_type); + const size_t s_size_in_bytes = get_size_in_bytes(p_size, data_type); // Allocate on the device. void* s_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&s_d, s_size_in_bytes)); @@ -634,9 +638,9 @@ int main(int argc, char** argv) // WAR fOR MISSING CUBLAS FP8 NN SUPPORT. // Transpose V, so that we can do a TN BMM2, i.e. O = S x V' instead of O = S x V. - size_t const v_size = s_kv_padded * b * h * d; + const size_t v_size = s_kv_padded * b * h * d; // The size in bytes. - size_t const v_size_in_bytes = get_size_in_bytes(v_size, data_type); + const size_t v_size_in_bytes = get_size_in_bytes(v_size, data_type); float* vt_h = (float*) malloc(v_size * sizeof(float)); void* vt_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&vt_d, v_size_in_bytes)); @@ -676,7 +680,7 @@ int main(int argc, char** argv) = [min_s, fix_s, b](int s, std::vector& seqlens, std::vector& cu_seqlens, void** cu_seqlens_d) { std::transform(seqlens.begin(), seqlens.end(), seqlens.begin(), - [=](uint32_t const) + [=](const uint32_t) { if (fix_s) { @@ -728,7 +732,7 @@ int main(int argc, char** argv) void* kv_packed_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&kv_packed_d, kv_packed_size_in_bytes)); - size_t const o_packed_size = cu_seqlens_q.back() * h * d; + const size_t o_packed_size = cu_seqlens_q.back() * h * d; // Allocate on the host. float* o_packed_h = (float*) malloc(o_packed_size * sizeof(float)); float* o_ref_packed_h = (float*) malloc(o_packed_size * sizeof(float)); diff --git a/cpp/kernels/fmha_v2/src/softmax_bf16.cu b/cpp/kernels/fmha_v2/src/softmax_bf16.cu index 5212d31717..79b681b502 100644 --- a/cpp/kernels/fmha_v2/src/softmax_bf16.cu +++ b/cpp/kernels/fmha_v2/src/softmax_bf16.cu @@ -12,9 +12,10 @@ #include "softmax_impl.h" -void run_softmax_bf16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi) +void run_softmax_bf16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, 0.f, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, + b, h, 0.f, 0.f, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_fp16.cu b/cpp/kernels/fmha_v2/src/softmax_fp16.cu index 1fb68b1136..9df37605a2 100644 --- a/cpp/kernels/fmha_v2/src/softmax_fp16.cu +++ b/cpp/kernels/fmha_v2/src/softmax_fp16.cu @@ -12,9 +12,10 @@ #include "softmax_impl.h" -void run_softmax_fp16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi) +void run_softmax_fp16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi) { - run_softmax(dst, src, mask, 
softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, 0.f, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, + h, 0.f, 0.f, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_fp32.cu b/cpp/kernels/fmha_v2/src/softmax_fp32.cu index 2b3bb6acbb..12bcd8624d 100644 --- a/cpp/kernels/fmha_v2/src/softmax_fp32.cu +++ b/cpp/kernels/fmha_v2/src/softmax_fp32.cu @@ -12,9 +12,10 @@ #include "softmax_impl.h" -void run_softmax_fp32(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi) +void run_softmax_fp32(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, 0.f, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, + b, h, 0.f, 0.f, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_fp8.cu b/cpp/kernels/fmha_v2/src/softmax_fp8.cu index 0a8e5f5029..26c2f5e88d 100644 --- a/cpp/kernels/fmha_v2/src/softmax_fp8.cu +++ b/cpp/kernels/fmha_v2/src/softmax_fp8.cu @@ -12,10 +12,10 @@ #include "softmax_impl.h" -void run_softmax_e4m3(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, int warps_n, - bool has_alibi) +void run_softmax_e4m3(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, + int warps_n, bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, - scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, + b, h, 0.f, scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_impl.h b/cpp/kernels/fmha_v2/src/softmax_impl.h index 2bc9f3380b..ca65262744 100644 --- a/cpp/kernels/fmha_v2/src/softmax_impl.h +++ b/cpp/kernels/fmha_v2/src/softmax_impl.h @@ -10,6 +10,7 @@ * its affiliates is strictly prohibited. */ +#include #include #include #include @@ -33,6 +34,8 @@ struct Softmax_params Src_type const* src; // Masks. int8_t const* mask; + // Attention sinks (per head). + float const* attention_sinks; // Softmax sum pointer. float* softmax_sum; // ALiBi @@ -148,7 +151,8 @@ static inline __device__ float apply_exp_(float x, float max) //////////////////////////////////////////////////////////////////////////////////////////////////// template -static inline __device__ void reduce(float (&data_fp32)[N][1], int8_t const (&mask)[N][1], int warps_n, float& sum_fp32) +static inline __device__ void reduce( + float (&data_fp32)[N][1], const int8_t (&mask)[N][1], int warps_n, float& sum_fp32, float const attention_sink) { // Apply the masks. @@ -233,7 +237,7 @@ static inline __device__ void reduce(float (&data_fp32)[N][1], int8_t const (&ma } // Normalize. 
- float inv_sum_fp32 = 1.f / sum_fp32; + float inv_sum_fp32 = 1.f / (sum_fp32 + expf(attention_sink - max_fp32)); #pragma unroll for (int ii = 0; ii < N; ++ii) { @@ -244,7 +248,8 @@ static inline __device__ void reduce(float (&data_fp32)[N][1], int8_t const (&ma //////////////////////////////////////////////////////////////////////////////////////////////////// template -static inline __device__ void reduce(float (&data_fp32)[N][2], int8_t const (&mask)[N][2], int warps_n, float& sum_fp32) +static inline __device__ void reduce( + float (&data_fp32)[N][2], const int8_t (&mask)[N][2], int warps_n, float& sum_fp32, float const attention_sink) { // Apply the masks. #pragma unroll @@ -401,7 +406,7 @@ static inline __device__ void reduce(float (&data_fp32)[N][2], int8_t const (&ma } // Normalize. - float inv_sum_fp32 = 1.f / sum_fp32; + float inv_sum_fp32 = 1.f / (sum_fp32 + expf(attention_sink - max_fp32)); #pragma unroll for (int ii = 0; ii < N; ++ii) { @@ -413,7 +418,8 @@ static inline __device__ void reduce(float (&data_fp32)[N][2], int8_t const (&ma //////////////////////////////////////////////////////////////////////////////////////////////////// template -static inline __device__ void reduce(float (&data_fp32)[N][4], int8_t const (&mask)[N][4], int warps_n, float& sum_fp32) +static inline __device__ void reduce( + float (&data_fp32)[N][4], const int8_t (&mask)[N][4], int warps_n, float& sum_fp32, float const attention_sink) { // Apply the masks. @@ -824,7 +830,7 @@ static inline __device__ void reduce(float (&data_fp32)[N][4], int8_t const (&ma } // Normalize. - float inv_sum_fp32 = 1.f / sum_fp32; + float inv_sum_fp32 = 1.f / (sum_fp32 + expf(attention_sink - max_fp32)); #pragma unroll for (int ii = 0; ii < N; ++ii) { @@ -994,9 +1000,16 @@ static __global__ void softmax_kernel(Softmax_params params) } } + // The attention sink value. + float attention_sink = -FLT_MAX; + if (params.attention_sinks != nullptr) + { + attention_sink = params.attention_sinks[hi]; + } + // Do the reduction. float sum_fp32 = 0.f; - reduce(data_fp32, mask_, params.warps_n, sum_fp32); + reduce(data_fp32, mask_, params.warps_n, sum_fp32, attention_sink); if (threadIdx.x == 0) { int sum_s = params.cu_q_seqlens[bi]; @@ -1025,9 +1038,9 @@ static __global__ void softmax_kernel(Softmax_params params) //////////////////////////////////////////////////////////////////////////////////////////////////// template -void run_softmax(void* dst, void const* src, void const* mask, void* softmax_sum, void* cu_q_seqlens, int s_inner, - int s_outer, int b, int h, float scale_bmm1, float scale_softmax, float softcapping_scale_bmm1, int warps_n, - bool has_alibi) +void run_softmax(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum, + void* cu_q_seqlens, int s_inner, int s_outer, int b, int h, float scale_bmm1, float scale_softmax, + float softcapping_scale_bmm1, int warps_n, bool has_alibi) { Softmax_params params; @@ -1039,6 +1052,7 @@ void run_softmax(void* dst, void const* src, void const* mask, void* softmax_sum params.softmax_sum = reinterpret_cast(softmax_sum); params.cu_q_seqlens = reinterpret_cast(cu_q_seqlens); params.mask = reinterpret_cast(mask); + params.attention_sinks = reinterpret_cast(attention_sinks); params.has_alibi = has_alibi; // The dimensions and precomputed values. 
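For reference only (not part of this patch): the softmax_impl.h change above treats the attention sink as one extra logit per head that receives probability mass but contributes no value, so it only enlarges the softmax denominator, exactly the `1.f / (sum_fp32 + expf(attention_sink - max_fp32))` term. A minimal host-side sketch of that normalization, with illustrative names:

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: softmax over one row of scores with an optional attention sink.
// The sink adds exp(sink - rowMax) to the denominator only; there is no extra output column.
std::vector<float> softmaxWithSink(std::vector<float> const& scores, float const* sink)
{
    float rowMax = scores[0];
    for (float s : scores)
        rowMax = std::max(rowMax, s);
    std::vector<float> probs(scores.size());
    float rowSum = 0.f;
    for (size_t i = 0; i < scores.size(); ++i)
    {
        probs[i] = std::exp(scores[i] - rowMax);
        rowSum += probs[i];
    }
    if (sink != nullptr)
    {
        rowSum += std::exp(*sink - rowMax); // the sink term introduced by this patch
    }
    for (float& p : probs)
        p /= rowSum;
    return probs;
}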
diff --git a/cpp/kernels/fmha_v2/src/softmax_int8.cu b/cpp/kernels/fmha_v2/src/softmax_int8.cu index 772fe1520c..28701de978 100644 --- a/cpp/kernels/fmha_v2/src/softmax_int8.cu +++ b/cpp/kernels/fmha_v2/src/softmax_int8.cu @@ -12,10 +12,10 @@ #include "softmax_impl.h" -void run_softmax_int8(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_bmm1, float scale_softmax, float softcapping_scale_bmm1, - int warps_n, bool has_alibi) +void run_softmax_int8(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_bmm1, float scale_softmax, + float softcapping_scale_bmm1, int warps_n, bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, scale_bmm1, - scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, + scale_bmm1, scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu index c9690cbc6b..69d93e901c 100644 --- a/cpp/kernels/xqa/mha.cu +++ b/cpp/kernels/xqa/mha.cu @@ -1379,6 +1379,19 @@ __device__ inline ThrdRegRowMax mergeRowMax( return mergedRowMax; } +__device__ inline void addAttentionSinks( + ThrdRegRowMax& globalRowSum, ThrdRegRowMax const globalRowMax, float const* attentionSinks) +{ + for (uint32_t i = 0; i < globalRowSum.size; i++) + { + uint32_t srcOffset = warp_size * i + laneId(); + if (srcOffset < headGrpSize) + { + globalRowSum[i] += expf(attentionSinks[srcOffset] - globalRowMax[i]); + } + } +} + #ifdef NDEBUG __device__ __forceinline__ #else @@ -1405,6 +1418,7 @@ CUBIN_EXPORT __global__ #if SPEC_DEC MaskType const* __restrict__ mask, // [qSeqLen, divUp(qSeqLen, 32)]. #endif + float const* attentionSinks, // [headGrpSize] #ifdef NDEBUG KVCacheList const& cacheList, #if BEAM_WIDTH > 1 @@ -2371,6 +2385,12 @@ CUBIN_EXPORT __global__ float voScale = (isKVCacheQuantized ? kvCacheScale[0] : 1.F); if (seqIterInit < nbSeqIters) { // otherwise rcpRowSum will be NAN. + // The attention sinks are moved to the multi-block reduction part if the multi-block is enabled. + if (!isMultiBlock && attentionSinks != nullptr) + { + // Attention sinks are per head. + addAttentionSinks(globalRowSum, globalRowMax, attentionSinks + headGrpSize * idxHeadGrp); + } ThrdRegRowMax const rcpRowSum = __frcp_rn(globalRowSum); #if LOW_PREC_OUTPUT voScale *= rcpOutScale[0]; @@ -2559,6 +2579,11 @@ CUBIN_EXPORT __global__ assert(std::isfinite(mergedRowSum[0])); } } + if (attentionSinks != nullptr) + { + // Attention sinks are per head. + addAttentionSinks(mergedRowSum, mergedRowMax, attentionSinks + headGrpSize * idxHeadGrp); + } __syncthreads(); rescaleAcc(warp, sumAcc, fullRescaleMask, __frcp_rn(mergedRowSum)); GemmOutRegTile const mergedOutTile = toFp16(sumAcc); @@ -2615,6 +2640,7 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha( MaskType const* __restrict__ mask, // [qSeqLen, divUp(qSeqLen, 32))] uint2 (each bit represents mask for one col // position). 
#endif + float const* attentionSinks, // [headGrpSize] KVCacheList const cacheList, #if BEAM_WIDTH > 1 BeamSearchParams const beamSearchParams, @@ -2640,7 +2666,7 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha( #if SPEC_DEC mask, #endif - cacheList, + attentionSinks, cacheList, #if BEAM_WIDTH > 1 beamSearchParams, #endif @@ -2667,6 +2693,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, @@ -2760,7 +2787,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SPEC_DEC mask, #endif - cacheList, + attentionSinks, cacheList, #if BEAM_WIDTH > 1 beamSearchParams, #endif @@ -2788,7 +2815,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SPEC_DEC mask, #endif - cacheList, + attentionSinks, cacheList, #if BEAM_WIDTH > 1 beamSearchParams, #endif diff --git a/cpp/kernels/xqa/mha.h b/cpp/kernels/xqa/mha.h index 39c94f985e..d35ad48104 100644 --- a/cpp/kernels/xqa/mha.h +++ b/cpp/kernels/xqa/mha.h @@ -101,6 +101,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, @@ -140,6 +141,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index 88d4c75e30..9a438df9a2 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -428,6 +428,7 @@ __device__ RegColWiseVec computeWarpColSum(Gemm0Acc& src); __device__ void storeGemm0AccToShm( uint32_t warpRank, uint32_t lane, SharedMem::XBuffer& smemX, CtaBarrier& barConsumed, Gemm0Acc const& acc); __device__ RegColWiseVec loadShmColWiseVecWithDup(ShmQWiseVec const& smemVec); +__device__ RegColWiseVec loadGmemColWiseVecWithDup(ShmQWiseVec const& gmemVec, uint32_t bound); #else __device__ RegRowWiseVec computeWarpGrpRowMax_sync(uint32_t warpRank, ShmQWiseVec& smemColMax, Gemm0Acc const& src); __device__ void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uint32_t validColEnd); @@ -453,7 +454,8 @@ __device__ void rescaleGemm1AccForNewColMax_sync(uint32_t warpRank, ShmQWiseVec template __device__ void finalizeAndWriteOut_sync(uint32_t threadRank, uint32_t warpRank, DstHead* dst, SharedMem::OutSwizzleBuf& swizzleBuf, Gemm1Acc& acc, float xvoScale, CtaBarrier& warpGrpBar, - ShmQWiseVec const& accColSum, uint32_t nbKHeads = 0 /* only for final result in spec dec. */); + ShmQWiseVec const& accColSum, ShmQWiseVec const& accColMax, ShmQWiseVec const* attentionSinksVec, + uint32_t nbKHeads = 0 /* only for final result in spec dec. 
*/); #else __device__ void transposeVTile( uint32_t warpRank, uint32_t lane, SharedMem::VTBuffer& dst, SharedMem::VBuffer const& src); @@ -651,6 +653,7 @@ CUBIN_EXPORT __global__ #else IOHead const* __restrict__ const q, // [nbReq][beamWidth][nbQHeads], #endif + float const* attentionSinks, // [headGrpSize] KVCacheList const cacheList, #if USE_BEAM_SEARCH BeamSearchParams const beamSearchParams, @@ -1252,7 +1255,7 @@ CUBIN_EXPORT __global__ IOHead* const dst = (scratchMem.tokens() + idxChunk).template cast(); #if SWAP_AB finalizeAndWriteOut_sync(threadIdx.x, warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, xvoScale, - smem.gemm1WarpGrpBar, smem.gemm1AccColSum); + smem.gemm1WarpGrpBar, smem.gemm1AccColSum, smem.gemm1AccColMax, nullptr); #else finalizeAndWriteOut_sync(warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, xvoScale, smem.gemm1AccColSum, 1, ctaNbValidTokens); @@ -1262,9 +1265,16 @@ CUBIN_EXPORT __global__ { uint32_t const outOffset = headGrpSize * (nbKHeads * (beamWidth * ctaInputTokBeg) + idxHeadGrp); OutputHead* const dst = &output[outOffset]; + ShmQWiseVec const* attentionSinksVec = nullptr; + if (attentionSinks != nullptr) + { + attentionSinksVec + = reinterpret_cast(attentionSinks + headGrpSize * idxHeadGrp); + } #if SWAP_AB finalizeAndWriteOut_sync(threadIdx.x, warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, - xvoScale, smem.gemm1WarpGrpBar, smem.gemm1AccColSum, nbKHeads); + xvoScale, smem.gemm1WarpGrpBar, smem.gemm1AccColSum, smem.gemm1AccColMax, attentionSinksVec, + nbKHeads); #else finalizeAndWriteOut_sync(warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, xvoScale, smem.gemm1AccColSum, nbKHeads, ctaNbValidTokens); @@ -1585,6 +1595,17 @@ CUBIN_EXPORT __global__ } unused(bar.consumed.arrive()); } + // Add the attention sinks. + if (attentionSinks != nullptr) + { + for (uint32_t i = 0; i < headsPerWarp; i++) + { + uint32_t const idxHead = wid + nbMathWarps * i; + float sink = expf( + attentionSinks[mha::min(idxHead, headGrpSize - 1) + idxHeadGrp * headGrpSize] - states[i].max); + states[i].sum += sink; + } + } __syncthreads(); uint32_t const outOffset = headGrpSize * (nbKHeads * (beamWidth * ctaInputTokBeg) + idxHeadGrp); auto const dst = &output[outOffset]; @@ -2029,6 +2050,22 @@ __device__ inline RegColWiseVec loadShmColWiseVecWithDup(ShmQWiseVec const& smem return ret; } +__device__ inline RegColWiseVec loadGmemColWiseVecWithDup(ShmQWiseVec const& gmemVec, uint32_t bound) +{ + RegColWiseVec ret; + constexpr uint32_t nbThrdsPerInstNBase = exactDiv(gmma::instNBase, GmmaAccCoreMat::cols); + auto const idx = laneId() % nbThrdsPerInstNBase; +#pragma unroll + for (uint32_t i = 0; i < exactDiv(ShmQWiseVec::size, gmma::instNBase); i++) + { + static_assert(nbThrdsPerInstNBase * RegColWiseVec::size == exactDiv(ShmQWiseVec::size, GmmaAccCoreMat::cols)); + ret[i] = reinterpret_cast< + Vec, exactDiv(ShmQWiseVec::size, GmmaAccCoreMat::cols)> const&>( + gmemVec)[mha::min(i * nbThrdsPerInstNBase + idx, bound)]; + } + return ret; +} + __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32_t validRowBeg, uint32_t validRowEnd) { uint32_t const idxInQuad = laneId() % 4; @@ -2878,12 +2915,19 @@ __device__ inline void saveTransposedOutput(uint32_t threadRank, uint32_t warpRa template __device__ inline void finalizeAndWriteOut_sync(uint32_t threadRank, uint32_t warpRank, DstHead* dst, SharedMem::OutSwizzleBuf& swizzleBuf, Gemm1Acc& acc, float xvoScale, CtaBarrier& warpGrpBar, - ShmQWiseVec const& accColSum, uint32_t nbKHeads) + ShmQWiseVec const& accColSum, ShmQWiseVec 
const& accColMax, ShmQWiseVec const* attentionSinksVec, uint32_t nbKHeads) { // @fixme: if ctaNbQHeads is large, use loadShmColWiseVecNoDup + rcp + shfl to avoid 8x waste of mufu.rcp // static_assert(ctaNbQHeads <= 8, "Warning: consider using loadShmColWiseVecNoDup + rcp + shfl to avoid 8x waste of // mufu.rcp"); - auto const regColSum = loadShmColWiseVecWithDup(accColSum); + auto regColSum = loadShmColWiseVecWithDup(accColSum); + if (attentionSinksVec != nullptr) + { + auto const regAccColMax = loadShmColWiseVecWithDup(accColMax); + auto const regAttentionSinks = loadGmemColWiseVecWithDup(attentionSinksVec[0], headGrpSize - 1); + auto regColSinks = expf(regAttentionSinks - regAccColMax); + regColSum = regColSum + regColSinks; + } auto const regOutScale = __frcp_rn(regColSum) * xvoScale; rescaleAcc(acc, regOutScale); @@ -3175,6 +3219,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, @@ -3286,7 +3331,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else q, #endif - cacheList, + attentionSinks, cacheList, #if USE_BEAM_SEARCH beamSearchParams, #endif @@ -3322,7 +3367,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else q, #endif - cacheList, + attentionSinks, cacheList, #if USE_BEAM_SEARCH beamSearchParams, #endif diff --git a/cpp/kernels/xqa/mla_sm120.cu b/cpp/kernels/xqa/mla_sm120.cu index 74877512a7..072908fe3e 100644 --- a/cpp/kernels/xqa/mla_sm120.cu +++ b/cpp/kernels/xqa/mla_sm120.cu @@ -1859,12 +1859,13 @@ CUtensorMap makeTensorMapForQ( #endif // IS_MLA void launchMLA(cudaDeviceProp const& prop, - uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed + uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed float qScale, OutputHead* output, InputHead const* q, + float* attentionSinks, // [headGrpSize], not supported. #if USE_PAGED_KV_CACHE - GMemCacheHead* pool, // global pool of pages + GMemCacheHead* pool, // global pool of pages KVCachePageIndex const* - kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] + kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] #else GMemKVCacheHead* kvCacheData, #endif diff --git a/cpp/kernels/xqa/test/refAttention.cpp b/cpp/kernels/xqa/test/refAttention.cpp index d8f1a688f5..dd356c101c 100644 --- a/cpp/kernels/xqa/test/refAttention.cpp +++ b/cpp/kernels/xqa/test/refAttention.cpp @@ -45,7 +45,7 @@ using Vector = Matrix; template Eigen::Matrix refFlashAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize) + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) { uint32_t const nbTiles = divUp(seqLen, tileSize); auto gemm1Acc = Eigen::Matrix::Zero().eval(); @@ -113,6 +113,16 @@ Eigen::Matrix refFlashAt } rowSum += tileRowSum; } + + // Add the attention sinks. 
+ if (attentionSinks != nullptr) + { + for (uint32_t i = 0; i < headGrpSize; i++) + { + rowSum[i] += expf(attentionSinks[i] - rowMax[i]); + } + } + Eigen::Matrix out = gemm1Acc.array().colwise() * (xScale * kvScale / rowSum.array()); std::for_each(out.data(), out.data() + out.size(), [](float& e) { e = float(OutputElem(e)); }); @@ -123,7 +133,7 @@ Eigen::Matrix refFlashAt template Eigen::Matrix \ refFlashAttention(IOHead const* q, \ CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, \ - float qScale, float kvScale, float xScale, uint32_t slidingWinSize) + float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) INSTANTIATE_refFlashAttention(CacheElem, 64, false, false); INSTANTIATE_refFlashAttention(CacheElem, 64, false, true); @@ -143,7 +153,7 @@ Eigen::Matrix refAttenti #else Eigen::Matrix refAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize) + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) { #endif float const rcpXScale = 1.f / xScale; @@ -184,7 +194,7 @@ Eigen::Matrix refAttenti Eigen::Matrix x = (gemm0Acc.colwise() - rowMax).array().exp().eval(); - Eigen::Vector const rowSum = x.rowwise().sum().eval(); + Eigen::Vector rowSum = x.rowwise().sum().eval(); std::for_each(x.data(), x.data() + x.size(), [&](float& e) { e = float(MathElem(e * rcpXScale)); }); @@ -200,6 +210,18 @@ Eigen::Matrix refAttenti } } } + + // Add the attention sinks. +#if !SPEC_DEC + if (attentionSinks != nullptr) + { + for (uint32_t i = 0; i < headGrpSize; i++) + { + rowSum[i] += expf(attentionSinks[i] - rowMax[i]); + } + } +#endif + Eigen::Matrix out = gemm1Acc.array().colwise() * (xScale * kvScale / rowSum.array()); std::for_each(out.data(), out.data() + out.size(), [](float& e) { e = float(OutputElem(e)); }); @@ -217,7 +239,7 @@ Eigen::Matrix refAttenti template Eigen::Matrix \ refAttention(IOHead const* q, CacheSeq const& k, \ CacheSeq const& v, uint32_t seqLen, float qScale, float kvScale, float xScale, \ - uint32_t slidingWinSize) + uint32_t slidingWinSize, float* attentionSinks) #endif INSTANTIATE_refAttention(InputElem, false, false); INSTANTIATE_refAttention(InputElem, false, true); diff --git a/cpp/kernels/xqa/test/refAttention.h b/cpp/kernels/xqa/test/refAttention.h index bfab141829..a073ed0e80 100644 --- a/cpp/kernels/xqa/test/refAttention.h +++ b/cpp/kernels/xqa/test/refAttention.h @@ -83,7 +83,7 @@ struct CacheSeq template Eigen::Matrix refFlashAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize); + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks); template #if SPEC_DEC @@ -93,7 +93,7 @@ Eigen::Matrix refAttenti #else Eigen::Matrix refAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize); + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks); #endif template diff --git a/cpp/kernels/xqa/test/test.cpp b/cpp/kernels/xqa/test/test.cpp index b922857862..91b35f3e1a 100644 --- a/cpp/kernels/xqa/test/test.cpp +++ b/cpp/kernels/xqa/test/test.cpp @@ -130,7 +130,7 @@ template #endif #endif void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, bool verbose = false, - bool saveData = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30) + bool saveData = false, bool 
hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30) { #if IS_MLA if (nbKHeads != 1) @@ -613,6 +613,17 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, } } + // Allocate the attention sinks (per head) + auto attentionSinks = ManagedMemBuf(nbQHeads); + // The attention sinks ptr. + float* attentionSinksPtr = hasAttentionSinks ? reinterpret_cast(attentionSinks.get()) : nullptr; + // Initialize the attention sinks (use large values to detect the potential bugs). + for (uint32_t i = 0; i < nbQHeads; i++) + { + // Range: [2, 5] + attentionSinks.get()[i] = 2.f + float(i % 4); + } + if (verbose) { printf("migrating data to gpu\n"); @@ -640,6 +651,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, #if BEAM_WIDTH > 1 cacheIndir.prefetch(dev, stream); #endif + attentionSinks.prefetch(dev, stream); }; prefetchToDevice(device); checkCuda(cudaMemsetAsync(semaphores.get(), 0, 4 * nbSemaphores, stream)); @@ -720,6 +732,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, &qHeads[0][0][0], #endif #endif + attentionSinksPtr, #if PAGED_KV_CACHE_LAYOUT == 1 && USE_PAGED_KV_CACHE cacheKHeads.get(), cacheVHeads.get(), #else @@ -1028,10 +1041,13 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, hostMask, qSeqLen, q_len); #else Eigen::Matrix refOutput; + auto const refAttentionSinks + = hasAttentionSinks ? attentionSinksPtr + headGrpSize * idxKHead : nullptr; if (useQGMMA) { refOutput = refFlashAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, - vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); + vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, + refAttentionSinks); // refOutput = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, // vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); } @@ -1039,8 +1055,9 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, { // refOutput = refFlashAttention(&qHeads[req][b][headGrpSize * idxKHead], // kCacheSeq, vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale); - refOutput = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, - vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); + refOutput + = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, vCacheSeq, + seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, refAttentionSinks); } #endif if (lowPrecOutput) @@ -1196,11 +1213,23 @@ TEST(RefCheck, llama_V2_70b) runTest<2>(2, 514, false, true); runTest<1>(1, 4096, false, true); #if SLIDING_WINDOW - runTest<2>(2, 4096, false, true, false, false, ~0, 256); - runTest<2>(2, 400, false, true, false, false, ~0U, 256); + runTest<2>(2, 4096, false, true, false, false, false, ~0, 256); + runTest<2>(2, 400, false, true, false, false, false, ~0U, 256); #endif runTest<8>(120, 367, false, true); - // runTest<8>(1792, 2048, false, true); + runTest<8>(1792, 2048, false, true); +} + +TEST(RefCheck, attention_sinks) +{ + auto runAttentionSinksTest = [](uint32_t batchSize, uint32_t seqLen) + { runTest<8>(batchSize, seqLen, false, true, false, false, /*hasAttentionSinks*/ true); }; + + runAttentionSinksTest(2, 2); + runAttentionSinksTest(2, 15); + runAttentionSinksTest(2, 256); + runAttentionSinksTest(2, 514); + runAttentionSinksTest(1, 4096); } TEST(Perf, tracing_long) @@ -1264,7 +1293,7 @@ TEST(Perf, mlperf_gptj) #ifndef NDEBUG GTEST_SKIP() 
<< "Skipping perf tests for debug build"; #endif - runTest<32>(396, 800 + 224, true, false, false, false, 800); + runTest<32>(396, 800 + 224, true, false, false, false, false, 800); } TEST(Perf, mlperf_llama) diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h index 565c170e1d..ba0c788277 100644 --- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h +++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h @@ -53,6 +53,7 @@ using namespace CUTLASS_MOE_GEMM_KERNELS_NAMESPACE; using CUTLASS_MOE_GEMM_NAMESPACE::TmaWarpSpecializedGroupedGemmInput; using CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::CutlassMoeFCRunner; using CUTLASS_MOE_GEMM_NAMESPACE::ActivationType; +using CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::ActivationParams; using CUTLASS_MOE_GEMM_NAMESPACE::isGatedActivation; static BufferManager::CudaStreamPtr streamPtr; @@ -984,7 +985,7 @@ public: mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr, mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex, - mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex, + ActivationParams(mActType), mExpertWeight2 + mExpertWeight2Size * mBufferIndex, mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex, mFinalOutput + mFinalOutputSize * mBufferIndex, @@ -996,7 +997,7 @@ public: mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr, mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex, - mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex, + ActivationParams(mActType), mExpertWeight2 + mExpertWeight2Size * mBufferIndex, mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex, mFinalOutput + mFinalOutputSize * mBufferIndex, diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index 6e1498ba71..b71fdd9b53 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -55,6 +55,7 @@ struct FusedQKVMaskedAttentionDispatchParams T const* qkv_bias; T const* relative_attention_bias; bool const* attention_mask; + float const* attention_sinks; float const* logn_scaling_ptr; int const* cache_indir; void* context_buf; @@ -71,6 +72,7 @@ struct FusedQKVMaskedAttentionDispatchParams RotaryScalingType rotary_embedding_scale_type; float rotary_embedding_scale; float const* rotary_embedding_inv_freq_cache; + float2 const* rotary_embedding_cos_sin_cache; float rotary_embedding_short_m_scale; float rotary_embedding_long_m_scale; int rotary_embedding_max_positions; @@ -225,6 +227,7 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.output = generationsParams.context_buf; xqaParams.qkv = generationsParams.attention_input; xqaParams.cache_indir = generationsParams.cache_indir; + xqaParams.attention_sinks = generationsParams.attention_sinks; xqaParams.kv_scale_orig_quant = generationsParams.kv_scale_orig_quant; xqaParams.kv_scale_quant_orig = generationsParams.kv_scale_quant_orig; xqaParams.host_past_key_value_lengths = 
generationsParams.host_past_key_value_lengths; @@ -596,6 +599,7 @@ void fusedQKV_masked_attention_dispatch(Multihead_attention_params const& params, cudaStrea fmhaParams.outputPtr = mCpSize > 1 ? gatherOutBuffer : params.context_buf; // only use [totalLength, h / cpSize, Dh] fmhaParams.outputSfPtr = params.context_buf_sf; + fmhaParams.attentionSinksPtr = params.attention_sinks; fmhaParams.packedMaskPtr = params.attention_packed_mask; if constexpr (std::is_same_v) { @@ -2220,6 +2228,7 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud dispatch_params.relative_attention_bias_stride = relative_attention_bias_stride; dispatch_params.attention_mask = params.attention_mask; dispatch_params.attention_mask_stride = params.attention_mask_stride; + dispatch_params.attention_sinks = params.attention_sinks; dispatch_params.max_distance = max_distance; dispatch_params.cache_indir = params.cache_indir; dispatch_params.context_buf = mCpSize > 1 ? mhaOutput : params.context_buf; // @@ -2267,6 +2276,7 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud dispatch_params.rotary_embedding_scale_type = mRotaryEmbeddingScaleType; dispatch_params.rotary_embedding_scale = mRotaryEmbeddingScale; dispatch_params.rotary_embedding_inv_freq_cache = params.rotary_inv_freq; + dispatch_params.rotary_embedding_cos_sin_cache = params.rotary_cos_sin; dispatch_params.rotary_embedding_short_m_scale = mRotaryEmbeddingShortMscale; dispatch_params.rotary_embedding_long_m_scale = mRotaryEmbeddingLongMscale; dispatch_params.rotary_embedding_max_positions = mRotaryEmbeddingMaxPositions; diff --git a/cpp/tensorrt_llm/common/attentionOp.h b/cpp/tensorrt_llm/common/attentionOp.h index fb71c06d57..cafb58f498 100644 --- a/cpp/tensorrt_llm/common/attentionOp.h +++ b/cpp/tensorrt_llm/common/attentionOp.h @@ -65,6 +65,8 @@ public: T const* qkv_bias = nullptr; // Attention mask input, which has shape of [batch_size, attention_mask_stride]. bool const* attention_mask = nullptr; + // Attention sinks with shape of [num_heads_q] float. + float const* attention_sinks = nullptr; // Rotary inv_freq cache buffer to avoid re-computing. float const* rotary_inv_freq = nullptr; // Rotary cos sin cache buffer to avoid re-computing. 
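As a reading aid for the plumbing above (not part of the patch): the new `attention_sinks` buffer holds one float per query head, so with grouped-query attention a KV-head group takes its slice at `headGrpSize * idxHeadGrp` and each query head in the group adds `expf(sink - rowMax)` to its own row sum, as the XQA kernels do. A hedged sketch with illustrative names:

#include <cmath>
#include <cstdint>

// Sketch: apply per-query-head sinks to the per-row softmax sums of one KV-head group.
// attentionSinks has nbQHeads = nbKHeads * headGrpSize entries, matching the [num_heads_q]
// field added above; rowSum/rowMax hold one value per query head in the group.
void applySinksForGroup(float* rowSum, float const* rowMax, float const* attentionSinks,
    uint32_t idxHeadGrp, uint32_t headGrpSize)
{
    float const* sinks = attentionSinks + headGrpSize * idxHeadGrp; // this group's slice
    for (uint32_t i = 0; i < headGrpSize; ++i)
    {
        rowSum[i] += std::exp(sinks[i] - rowMax[i]); // denominator-only contribution
    }
}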
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp index c10df82d54..53dc9e053a 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp @@ -27,6 +27,78 @@ namespace cutlass::gemm::collective::detail { +using namespace cute; + +typedef uint32_t __nv_fp4x8_storage_t; +typedef uint32_t __nv_bf16x2_storage_t; +typedef cutlass::uint128_t __nv_bf16x8_storage_t; + +constexpr int int4_group_size = 128; +constexpr int mxfp4_group_size = 32; + +inline __device__ unsigned prmt(unsigned hi, unsigned lo, unsigned select_code) +{ + unsigned res = 0; + + asm volatile( + "{\n" + "prmt.b32 %0, %1, %2, %3;\n" + "}\n" + : "=r"(res) + : "r"(lo), "r"(hi), "r"(select_code)); + + return res; +} + +__device__ __inline__ __nv_fp8x4_storage_t cvt_lut_bf16(unsigned const index) +{ + const __nv_fp8x4_storage_t h4b_lut = 0x03020100U; // 7654 + const __nv_fp8x4_storage_t l4b_lut = 0xFFFEFC00U; // 3210 + + __nv_fp8x4_storage_t lut_res = prmt(h4b_lut, l4b_lut, index); + + return lut_res; +} + +__device__ __inline__ __nv_bf16x8_storage_t psx_cvt_lut_prmt_fp4x8_to_bf16x8(const __nv_fp4x8_storage_t fp4x8) +{ + __nv_bf16x8_storage_t bf16x8_raw = {0, 0}; + __nv_bf16x2_storage_t* bf16x2_raw = reinterpret_cast<__nv_bf16x2_storage_t*>(&bf16x8_raw); + + unsigned zero_padding = 0x00000000U; + + unsigned h4b_em_fp4x4 = (fp4x8 & 0x77770000U) >> 16U; + unsigned l4b_em_fp4x4 = (fp4x8 & 0x00007777U); + + __nv_fp8x4_storage_t h4b_2to9_bits = cvt_lut_bf16(h4b_em_fp4x4); // 7654 + __nv_fp8x4_storage_t l4b_2to9_bits = cvt_lut_bf16(l4b_em_fp4x4); // 3210 + + bf16x2_raw[0] = prmt(zero_padding, l4b_2to9_bits, 0x1707U) >> 2U; // 1 0 + bf16x2_raw[1] = prmt(zero_padding, l4b_2to9_bits, 0x3727U) >> 2U; // 3 2 + bf16x2_raw[2] = prmt(h4b_2to9_bits, zero_padding, 0x5040U) >> 2U; // 5 4 + bf16x2_raw[3] = prmt(h4b_2to9_bits, zero_padding, 0x7060U) >> 2U; // 7 6 + + __nv_bf16x2_storage_t bf16x2_0to1_bits; + + __nv_fp8x4_storage_t h_fp8x2_0to1_bits = (fp4x8 & 0x0000C0C0U); // 3 1 + __nv_fp8x4_storage_t l_fp8x2_0to1_bits = (fp4x8 & 0x00000C0CU) << 4U; // 2 0 + + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x4707U); // 1 0 + bf16x2_raw[0] = bf16x2_raw[0] | bf16x2_0to1_bits; + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x5717U); // 3 2 + bf16x2_raw[1] = bf16x2_raw[1] | bf16x2_0to1_bits; + + h_fp8x2_0to1_bits = (fp4x8 & 0xC0C00000U); // 7 5 + l_fp8x2_0to1_bits = (fp4x8 & 0x0C0C0000U) << 4U; // 6 4 + + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x6020U); // 5 4 + bf16x2_raw[2] = bf16x2_raw[2] | bf16x2_0to1_bits; + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x7030U); // 7 6 + bf16x2_raw[3] = bf16x2_raw[3] | bf16x2_0to1_bits; + + return bf16x8_raw; +} + template struct MixedGroupedGemmInputUtils { @@ -46,6 +118,7 @@ private: static constexpr auto KernelConversionMode = Collective::KernelConversionMode; static constexpr auto ModeHasScales = Collective::ModeHasScales; static constexpr auto UseScaleLookupTable = Collective::UseScaleLookupTable; + static constexpr auto UseFP4ToBF16LookupTable = Collective::UseFP4ToBF16LookupTable; public: static constexpr auto elements_per_smem_scale() @@ -239,6 +312,27 @@ public: } } + // The core converter uses a lookup 
table to converts i4 -> 8 bit value. + template + CUTLASS_DEVICE static void fp4tobf16_lookup_table_convert( // Accept mutable temporaries + Tensor const& src, Tensor&& dst) + { + fp4tobf16_lookup_table_convert(src, dst); + } + + template + CUTLASS_DEVICE static void fp4tobf16_lookup_table_convert( + Tensor const& src, Tensor& dst) + { + + // View the input as reg + auto&& src_ = cute::recast<__nv_fp4x8_storage_t>(src)(0); + auto&& dst_ = cute::recast<__nv_bf16x8_storage_t>(dst)(0); + + dst_ = psx_cvt_lut_prmt_fp4x8_to_bf16x8(src_); + } + /// Utilities to dequantize A. template CUTLASS_DEVICE static void static_check_scale(Layout const& tensor) @@ -253,7 +347,6 @@ public: static_check_scale(flatten(Layout{})); } - // dequantize_A_kblock is here!!! template CUTLASS_DEVICE static void dequantize_A_kblock(Tensor const& tCrA_load, Tensor& tCrA_mma, cute::tuple& partitioned_extra_info, int const k_block) @@ -288,8 +381,6 @@ public: } else if constexpr (UseScaleLookupTable) { - // this path - constexpr int num_elements = decltype(size(src))::value; static_assert(is_same_v, "Lookup table only supports int4 being the quant type now."); @@ -424,7 +515,6 @@ public: static_assert(size_v == cosize_v); static_assert(size_v == cosize_v); using SrcType = typename EngineIn::value_type; - using DstType = typename EngineOut::value_type; Tensor src = tCrA_load(_, _, k_block); Tensor dst = tCrA_mma(_, _, k_block); @@ -441,7 +531,14 @@ public: CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size<1>(dst_vm); ++i) { - LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + if constexpr (UseFP4ToBF16LookupTable) + { + fp4tobf16_lookup_table_convert(src_vm(_, i), dst_vm(_, i)); + } + else + { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + } } } diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp index 1ee109fd64..2332950629 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp @@ -30,37 +30,12 @@ #include "cute/atom/mma_atom.hpp" #include "cute/numeric/arithmetic_tuple.hpp" -#define GROUP_SIZE 128 - ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass::gemm::collective { using namespace cute; -template -CUTE_HOST_DEVICE void warpgroup_wait_() -{ -#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) - cutlass::arch::synclog_emit_warpgroup_wait(__LINE__, N); - asm volatile("wgmma.wait_group.sync.aligned %0;\n" ::"n"(N) : "memory"); -#else - CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.wait_group without CUTE_ARCH_MMA_SM90A_ENABLED"); -#endif -} - -CUTLASS_DEVICE void warpgroup_wait_dispatch(int onthefly_count) -{ - switch (onthefly_count) - { - case 0: warpgroup_wait_<0>(); break; - case 4: warpgroup_wait_<4>(); break; - case 8: warpgroup_wait_<8>(); break; - case 12: warpgroup_wait_<12>(); break; - default: assert(false && "Invalid onthefly_count value"); - } -} - ///////////////////////////////////////////////////////////////////////////////////////////////// // WarpSpecialized Mainloop @@ -91,7 +66,7 @@ public: private: template friend struct detail::MixedGroupedGemmInputUtils; - using 
CollectiveType = CollectiveMma; using Utils = detail::MixedGroupedGemmInputUtils; @@ -146,6 +121,11 @@ public: static_assert(cutlass::gemm::detail::is_mn_major(), "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled]."); + static constexpr bool IsMXFP4 = cute::is_same_v; + // Group size 128 for int4 weights + // Group size 32 for mxfp4 weights + static constexpr int ScalingGroupSize = IsMXFP4 ? detail::mxfp4_group_size : detail::int4_group_size; + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); using TiledMma = TiledMma_; using ElementAccumulator = typename TiledMma::ValTypeC; @@ -268,6 +248,8 @@ public: || KernelConversionMode == ConversionMode::ConvertAndScaleWithZero; static constexpr bool UseScaleLookupTable = KernelConversionMode == ConversionMode::ConvertAndScale && cutlass::detail::is_Array_v; + static constexpr bool UseFP4ToBF16LookupTable = KernelConversionMode == ConversionMode::ConvertAndScale + && cute::is_same_v && cute::is_same_v; static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{}); static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB); @@ -705,7 +687,7 @@ public: { // The real scale_k that actually works // auto scale_k = K / mainloop_params.chunk_size; - auto scale_k = K / GROUP_SIZE; + auto scale_k = K / ScalingGroupSize; Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(make_shape(M, scale_k, L)); // (m,scale_k,l) Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{}, make_coord(_, _)); // (BLK_M,BLK_Scale_K,m,scale_k,l) @@ -872,7 +854,6 @@ public: } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { - // zero copy auto tZgZ = get<2>(extra_input_partitions); auto tZsZ = get<3>(extra_input_partitions); if (cute::elect_one_sync()) @@ -979,7 +960,8 @@ public: return make_tensor_like(tCsA(_, _, _, Int<0>{})); } }(); - Tensor tCsB = mma_warpgroup_slice.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + Tensor tCsB = mma_warpgroup_slice.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + // tCrB is just a view of the tensor tCsB Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) // @@ -1013,8 +995,8 @@ public: multiply_add fma; - constexpr int NumMMAsPerChunk = GROUP_SIZE / cute::get<0, 1>(tCsB.shape())(); - constexpr int NumChunksPerTileK = cute::size<1>(sA.shape())() / GROUP_SIZE; + constexpr int NumMMAsPerChunk = ScalingGroupSize / cute::get<0, 1>(tCsB.shape())(); + constexpr int NumChunksPerTileK = cute::size<1>(sA.shape())() / ScalingGroupSize; cute::array intermediate_array; constexpr int K_BLOCK_MAX = size<2>(tCrA_load); @@ -1045,8 +1027,6 @@ public: // src: tCrA_load, dst: tCrA_mma Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0); - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - // Unroll the K mode manually to set scale D to 1 CUTLASS_PRAGMA_UNROLL for (int chunk_id = 0; chunk_id < NumChunksPerTileK; ++chunk_id) @@ -1079,10 +1059,11 @@ public: } } + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) { - warpgroup_wait_dispatch((NumChunksPerTileK - chunk_id_ - 1) * NumMMAsPerChunk); warpgroup_fence_operand(intermediate_array[chunk_id_]); // Apply the group-wise scaling @@ -1129,7 +1110,6 @@ public: Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info, copy_partitions_extra_info, 1, 
smem_pipe_read.index()); - warpgroup_wait(); Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0); } } @@ -1169,8 +1149,6 @@ public: tiled_mma.accumulate_ = GMMA::ScaleOut::One; warpgroup_commit_batch(); - warpgroup_wait(); // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, - // so we can release prior barrier if (k_block == K_BLOCK_MAX - 1) { pipeline.consumer_release( @@ -1187,10 +1165,11 @@ public: { // The last k_block + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) { - warpgroup_wait_dispatch((NumChunksPerTileK - chunk_id_ - 1) * NumMMAsPerChunk); warpgroup_fence_operand(intermediate_array[chunk_id_]); // Apply the group-wise scaling @@ -1257,7 +1236,6 @@ public: tiled_mma.accumulate_ = GMMA::ScaleOut::One; warpgroup_commit_batch(); - warpgroup_wait(); if (k_block == K_BLOCK_MAX - 1) { // release prior barrier @@ -1318,7 +1296,7 @@ public: smem_pipe_release.advance(k_tile_count); // Wait on all GMMAs to complete - warpgroup_wait<0>(); + // warpgroup_wait<0>(); for (int count = 0; count < prologue_mma_count; ++count) { @@ -1462,7 +1440,7 @@ public: { NonVoidElementScale const* ptr_S = nullptr; // auto scale_k = K / mainloop_params.chunk_size; - auto scale_k = K / GROUP_SIZE; + auto scale_k = K / ScalingGroupSize; Tensor tensor_scale = make_tensor( detail::get_logical_ptr(ptr_S), make_shape(M, scale_k, Int<1>{}), mainloop_params.dS[next_group]); cute::detail::fill_tma_gmem_shape_stride( @@ -1472,7 +1450,7 @@ public: { ElementZero const* ptr_Z = nullptr; // auto scale_k = K / mainloop_params.chunk_size; - auto scale_k = K / GROUP_SIZE; + auto scale_k = K / ScalingGroupSize; Tensor tensor_zero = make_tensor( detail::get_logical_ptr(ptr_Z), make_shape(M, scale_k, Int<1>{}), mainloop_params.dS[next_group]); cute::detail::fill_tma_gmem_shape_stride( diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu index 27d041618e..84710a9636 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu @@ -256,9 +256,9 @@ public: constexpr int SF_VEC_SIZE = 16; using PackedVec = PackedVec; PackedVec pack_val = *reinterpret_cast(&val); - auto sf_out = cvt_quant_to_fp4_get_sf_out_offset(std::nullopt, token_id, - m_access_id_in_token, std::nullopt, m_params.hidden_dim, - reinterpret_cast(m_params.scale_out), m_params.layout); + auto sf_out = cvt_quant_get_sf_out_offset(std::nullopt, token_id, m_access_id_in_token, + std::nullopt, m_params.hidden_dim / SF_VEC_SIZE, reinterpret_cast(m_params.scale_out), + m_params.layout); reinterpret_cast(m_params.quant_out)[m_access_id] = cvt_warp_fp16_to_fp4(pack_val, m_scale_factor, sf_out); } diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h index dbf45ebe1c..52487b25d4 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h @@ -132,7 +132,7 @@ struct AllReduceFusionParams float rms_eps; float* scale_factor; bool use_oneshot; - FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED; + QuantizationSFLayout layout = QuantizationSFLayout::SWIZZLED; cudaStream_t stream; AllReduceFusionPattern pattern; bool trigger_completion_at_end = true; diff --git 
a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu index 2176ba759f..c38abd9578 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu @@ -99,15 +99,15 @@ __device__ struct __attribute__((aligned(32))) LamportFlags uint32_t* offset_access_ptr; uint32_t* buffer_flags; - __device__ explicit LamportFlags(uint32_t* buffer_flags) + __device__ explicit LamportFlags(uint32_t* buffer_flags, uint32_t buffer_size) : offset_access_ptr(&buffer_flags[4]) , buffer_flags(buffer_flags) + , buffer_size(buffer_size) { uint4 flag = reinterpret_cast(buffer_flags)[0]; - buffer_size = flag.z; input_offset = flag.x * (buffer_size << 1U); clear_offset = flag.y * (buffer_size << 1U); - num_tokens_prev = flag.w; + num_tokens_prev = flag.z; } __device__ void cta_arrive() @@ -135,7 +135,7 @@ __device__ struct __attribute__((aligned(32))) LamportFlags uint4 flag = reinterpret_cast(buffer_flags)[0]; buffer_flags[0] = (flag.x + 1) % 3; buffer_flags[1] = (flag.y + 1) % 3; - buffer_flags[3] = num_tokens; + buffer_flags[2] = num_tokens; *(offset_access_ptr) = 0; } } @@ -144,7 +144,7 @@ __device__ struct __attribute__((aligned(32))) LamportFlags template __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ptrs, T* mcast_ptr, int num_tokens, - int buffer_M, int token_dim, int rank, uint32_t* buffer_flags, bool wait_for_results) + int buffer_M, int token_dim, int rank, uint32_t buffer_size, uint32_t* buffer_flags, bool wait_for_results) { int elt = blockIdx.y * blockDim.x + threadIdx.x; if (elt >= token_dim) @@ -155,7 +155,7 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ cudaGridDependencySynchronize(); #endif - LamportFlags flags(buffer_flags); + LamportFlags flags(buffer_flags, buffer_size); // Capture the number of tokens in previous iteration so that we can properly clear the buffer // The scatter stage will use the buffer in WORLD_SIZE granularity, thus we need to round up @@ -217,15 +217,17 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) cudaTriggerProgrammaticLaunchCompletion(); #endif - - // Similarly clear broadcast buffer here - for (int clr_tok = 0; clr_tok < clr_toks_cta; clr_tok++) + if (elt < token_dim) { - uint32_t clr_token_idx = token + clr_tok * gridDim.x; - if (clr_token_idx < buffer_M) + // Similarly clear broadcast buffer here + for (int clr_tok = 0; clr_tok < clr_toks_cta; clr_tok++) { - input_ptrs[rank][flags.clear_offset + buffer_M * token_dim + clr_token_idx * token_dim + elt] - = fromFloat(-0.f); + uint32_t clr_token_idx = token + clr_tok * gridDim.x; + if (clr_token_idx < buffer_M) + { + input_ptrs[rank][flags.clear_offset + buffer_M * token_dim + clr_token_idx * token_dim + elt] + = fromFloat(-0.f); + } } } @@ -240,20 +242,24 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ // blockDim.x / ELTS_PER_LOAD should be at least the size of a warp (32) if (threadIdx.x < (blockDim.x / ELTS_PER_LOAD)) { - uint64_t current_pos = blockIdx.x * token_dim + blockIdx.y * blockDim.x + threadIdx.x * ELTS_PER_LOAD; + uint64_t elt_load_offset = blockIdx.y * blockDim.x + threadIdx.x * ELTS_PER_LOAD; + if (elt_load_offset < token_dim) + { + uint64_t current_pos = blockIdx.x * token_dim + elt_load_offset; - 
void* lamport_ptr = (void*) &input_ptrs[rank][flags.input_offset + buffer_M * token_dim + current_pos]; - // We have 2 assumptions here: - // 1. The write is atomic in 8B granularity -> Each buffer in the buffer group should be aligned to 8B - // 2. The num_token * token_dim is divisible by ELTS_PER_LOAD (4 for BF16 and 2 for FP32) - float2 val = loadfloat2(lamport_ptr); - while (isNegZero(*(T*) &val)) - { - val = loadfloat2(lamport_ptr); - } - if (output_ptr) - { - *((float2*) &output_ptr[current_pos]) = val; + void* lamport_ptr = (void*) &input_ptrs[rank][flags.input_offset + buffer_M * token_dim + current_pos]; + // We have 2 assumptions here: + // 1. The write is atomic in 8B granularity -> Each buffer in the buffer group should be aligned to 8B + // 2. The num_token * token_dim is divisible by ELTS_PER_LOAD (4 for BF16 and 2 for FP32) + float2 val = loadfloat2(lamport_ptr); + while (isNegZero(*(T*) &val)) + { + val = loadfloat2(lamport_ptr); + } + if (output_ptr) + { + *((float2*) &output_ptr[current_pos]) = val; + } } } @@ -263,10 +269,11 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ } #define LAUNCH_ALL_REDUCE_KERNEL(WORLD_SIZE, T) \ - TLLM_CUDA_CHECK(cudaLaunchKernelEx(&config, &twoshot_allreduce_kernel, \ - reinterpret_cast(params.output), reinterpret_cast(params.input), \ - reinterpret_cast(params.buffer_ptrs_dev), (T*) params.multicast_ptr, params.num_tokens, params.buffer_M, \ - params.token_dim, params.rank, reinterpret_cast(params.buffer_flags), params.wait_for_results)); + TLLM_CUDA_CHECK( \ + cudaLaunchKernelEx(&config, &twoshot_allreduce_kernel, reinterpret_cast(params.output), \ + reinterpret_cast(params.input), reinterpret_cast(params.buffer_ptrs_dev), \ + (T*) params.multicast_ptr, params.num_tokens, params.buffer_M, params.token_dim, params.rank, \ + params.buffer_size, reinterpret_cast(params.buffer_flags), params.wait_for_results)); void twoshot_allreduce_op(AllReduceParams const& params) { @@ -369,20 +376,33 @@ inline __device__ T add(T a, T b) } #define FINAL_MASK 0xffffffff +#define WARP_SIZE 32 template __inline__ __device__ T warpReduceSum(T val) { + // Get the actual number of active threads in this warp + int active_warp_size = min(WARP_SIZE, blockDim.x - (threadIdx.x & ~(WARP_SIZE - 1))); + unsigned int mask = (1U << active_warp_size) - 1; + #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) - val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); //__shfl_sync bf16 return float when sm < 80 + for (int offset = 16; offset > 0; offset >>= 1) + { + if (offset < active_warp_size) + { + val = add(val, __shfl_xor_sync(mask, val, offset, WARP_SIZE)); + } + } return val; } inline __device__ float block_reduce_sum(float val) { - __shared__ float smem[32]; - int lane_id = threadIdx.x % 32, warp_id = threadIdx.x / 32, warp_num = blockDim.x / 32; + __shared__ float smem[WARP_SIZE]; + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int warp_num = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE; // Ceiling division to include partial warps + val = warpReduceSum(val); if (lane_id == 0) { @@ -391,6 +411,7 @@ inline __device__ float block_reduce_sum(float val) __syncthreads(); val = lane_id < warp_num ? 
smem[lane_id] : 0.f; val = warpReduceSum(val); + return val; } @@ -410,7 +431,7 @@ __device__ float4 loadfloat4(void const* ptr) template __global__ void __launch_bounds__(128, 1) RMSNorm(T_IN* input_plus_residual, T_OUT* output_norm, T_IN const* buffer_input, T_IN const* gamma, float epsilon, - T_IN const* residual, int batch_size, uint32_t* buffer_flags) + T_IN const* residual, int batch_size, uint32_t buffer_size, uint32_t* buffer_flags) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) static bool const LAMPORT = true; @@ -433,7 +454,7 @@ __global__ void __launch_bounds__(128, 1) int offsets[NUM_INPUTS][DIM / (1 * ELTS_PER_THREAD * NUM_THREADS)]; - LamportFlags flags(buffer_flags); + LamportFlags flags(buffer_flags, buffer_size); T_IN const* input = &buffer_input[flags.input_offset + flags.buffer_size]; cudaTriggerProgrammaticLaunchCompletion(); @@ -598,16 +619,15 @@ __global__ void __launch_bounds__(128, 1) #endif } -template +template void twoshot_rmsnorm(T* prenorm_output, T* normed_output, T const* input, T const* gamma, double epsilon, - T const* residual, uint32_t* buffer_flags, int batch, cudaStream_t stream) + T const* residual, uint32_t buffer_size, uint32_t* buffer_flags, int batch, cudaStream_t stream) { // input to rmsnorm is the buffer in the twoshot ar // We should use prenorm output to determine the actual used size float _epsilon{static_cast(epsilon)}; - static constexpr int NUM_THREADS = 128; static constexpr int CGA_THREADS = NUM_THREADS; constexpr int iters = H_DIM / CGA_THREADS; @@ -628,28 +648,34 @@ void twoshot_rmsnorm(T* prenorm_output, T* normed_output, T const* input, T cons &RMSNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_size); config.dynamicSmemBytes = shmem_size; TLLM_CUDA_CHECK(cudaLaunchKernelEx(&config, &RMSNorm, prenorm_output, normed_output, - input, gamma, _epsilon, residual, batch, buffer_flags)); + input, gamma, _epsilon, residual, batch, buffer_size, buffer_flags)); } -#define LAUNCH_RMSNORM_KERNEL(T, H_DIM) \ - twoshot_rmsnorm(static_cast(params.residual_output), static_cast(params.output), \ +#define LAUNCH_RMSNORM_KERNEL(T, H_DIM, NUM_THREADS) \ + twoshot_rmsnorm(static_cast(params.residual_output), static_cast(params.output), \ static_cast(params.input), static_cast(params.gamma), params.epsilon, \ - static_cast(params.residual), params.buffer_flags, params.batch, params.stream) + static_cast(params.residual), params.buffer_size, params.buffer_flags, params.batch, params.stream) void twoshot_rmsnorm_op(RMSNormParams const& params) { auto dtype = params.dtype; + +#define CASE_DISPATCH_RMSNORM(T, H_DIM, NUM_THREADS) \ + case H_DIM: LAUNCH_RMSNORM_KERNEL(T, H_DIM, NUM_THREADS); break; + +#define TYPE_DISPATCH_RMSNORM(T) \ + CASE_DISPATCH_RMSNORM(T, 2048, 128) \ + CASE_DISPATCH_RMSNORM(T, 2880, 120) \ + CASE_DISPATCH_RMSNORM(T, 4096, 128) \ + CASE_DISPATCH_RMSNORM(T, 5120, 128) \ + CASE_DISPATCH_RMSNORM(T, 7168, 128) \ + CASE_DISPATCH_RMSNORM(T, 8192, 128) + if (dtype == nvinfer1::DataType::kFLOAT) { switch (params.hidden_dim) { - case 2048: LAUNCH_RMSNORM_KERNEL(float, 2048); break; - case 4096: LAUNCH_RMSNORM_KERNEL(float, 4096); break; - // Llama-4 Hidden Dimension - case 5120: LAUNCH_RMSNORM_KERNEL(float, 5120); break; - // DeepSeek Hidden Dimension - case 7168: LAUNCH_RMSNORM_KERNEL(float, 7168); break; - case 8192: LAUNCH_RMSNORM_KERNEL(float, 8192); break; + TYPE_DISPATCH_RMSNORM(float); default: TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported hidden_dim."); } } @@ -657,13 +683,7 @@ void 
twoshot_rmsnorm_op(RMSNormParams const& params) { switch (params.hidden_dim) { - case 2048: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 2048); break; - case 4096: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 4096); break; - // Llama-4 Hidden Dimension - case 5120: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 5120); break; - // DeepSeek Hidden Dimension - case 7168: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 7168); break; - case 8192: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 8192); break; + TYPE_DISPATCH_RMSNORM(__nv_bfloat16); default: TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported hidden_dim."); } } @@ -671,13 +691,7 @@ void twoshot_rmsnorm_op(RMSNormParams const& params) { switch (params.hidden_dim) { - case 2048: LAUNCH_RMSNORM_KERNEL(__nv_half, 2048); break; - case 4096: LAUNCH_RMSNORM_KERNEL(__nv_half, 4096); break; - // Llama-4 Hidden Dimension - case 5120: LAUNCH_RMSNORM_KERNEL(__nv_half, 5120); break; - // DeepSeek Hidden Dimension - case 7168: LAUNCH_RMSNORM_KERNEL(__nv_half, 7168); break; - case 8192: LAUNCH_RMSNORM_KERNEL(__nv_half, 8192); break; + TYPE_DISPATCH_RMSNORM(__nv_half); default: TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported hidden_dim."); } } @@ -685,6 +699,8 @@ void twoshot_rmsnorm_op(RMSNormParams const& params) { TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported dtype."); } +#undef TYPE_DISPATCH_RMSNORM +#undef CASE_DISPATCH_RMSNORM } } // namespace tensorrt_llm::kernels::mnnvl diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h index ccca256b5a..3a0fb753db 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h @@ -30,6 +30,7 @@ struct AllReduceParams int buffer_M; int num_tokens; int token_dim; + uint32_t buffer_size; void** buffer_ptrs_dev; void* multicast_ptr; void* buffer_flags; @@ -50,6 +51,7 @@ struct RMSNormParams void const* gamma; double epsilon; void* residual; + uint32_t buffer_size; uint32_t* buffer_flags; int batch; int hidden_dim; diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu index 577f4b5ff4..7bc9e326fb 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu @@ -150,8 +150,8 @@ __device__ __forceinline__ void fused_op( constexpr int SF_VEC_SIZE = 16; using PackedVec = PackedVec; PackedVec pack_val = *reinterpret_cast(&norm_val); - auto sf_out = cvt_quant_to_fp4_get_sf_out_offset(std::nullopt /* batchIdx */, - token_id, access_id_in_token, std::nullopt /* numRows */, params.hidden_dim, + auto sf_out = cvt_quant_get_sf_out_offset(std::nullopt /* batchIdx */, token_id, + access_id_in_token, std::nullopt /* numRows */, params.hidden_dim / SF_VEC_SIZE, reinterpret_cast(params.scale_out), params.layout); reinterpret_cast(params.quant_out)[access_id] = cvt_warp_fp16_to_fp4(pack_val, *params.scale_factor, sf_out); diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h index 9ebc7de650..4a35d14bf0 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h @@ -55,7 
+55,7 @@ struct AllReduceFusionParams void* rms_gamma; float rms_eps; float* scale_factor; - FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED; + QuantizationSFLayout layout = QuantizationSFLayout::SWIZZLED; cudaStream_t stream; }; diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 81208594d0..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5bb139b12206a563daec9fa473dda422319bde5ae5f965d37cf5ca67d325c49 -size 1005546 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 7086ad9f48..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4357a935656d47414a459939720b66311c67213f450168715e1cb0238653768 -size 1066324 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp index 0acae9aa71..2ae91e52cd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a0671e7cbbed9f51dc0c47e4b970e2f72067d629ff6562c9d65f9cd55c68578 -size 361861 +oid sha256:c709dce149c0f4500539e495c90d1da2d86cec28c4187ee9494b015642e158cf +size 363441 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp index 4cb6bcd1c1..bce0c66bcf 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ec9817bebb07483ce29d8d91c45d35c2c05f0101bfa70146fba5a6576a6b825 -size 1091614 +oid sha256:b9170581da010aca67f4bafd9f6f59aaaf5fd1958a1fdd336aa208146599ac06 +size 1094770 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp index 470904148a..caa735d572 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0540cdb398818ec54a60c34b462c158e169347db73d244d633669d74211696ba -size 1467312 +oid sha256:2147a246067f7ea74ca382fbc8c02a26332479e5205ecfbe08fb84161a3a87ec +size 1483888 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 281985341d..0b584163a8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69bdfba64f1faff30ed8389a28b7b9ef37c0d180b1df643722b280011c8f74e8 -size 692990 +oid sha256:279bd48b8ac53690bb4e37dffbe9060428db80c1417ff29c6f4d4a10ab35a7c9 +size 700094 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 8b8738474d..496df695fc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8173308813999ab64ba8236016b23fbfd3f3f1501f61290bf71ea027ead2920 -size 642456 +oid sha256:db5d186ce70d7a94cae2b6619b3449ca557903944beba1ee738d2ee425792d74 +size 652718 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 6ca952af64..c6692932cd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f41ae066b01b2a9c3b5165535f743461a9a1d559f6fcd0a00a04c554f8a50962 -size 414757 +oid sha256:089a98cf8ab0bbd7530e69821c42220ea02578b740bff62a3e6e33de45209114 +size 416335 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1a973c5d2e..555f626864 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab0be8e667d459e13135f96469613f1c095e47187b24e5d40c7c57583351a076 -size 1194236 +oid sha256:1f0cc486ec5e9c1720f495a2a5e7c26d42e737694d307d4746a08b6ead5cc225 +size 1197394 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index 8faf85254d..b5884bba55 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03d86280f76994e2e01d43747cb5c811496b8340d031ebb0c3bdd46437422994 -size 1654394 +oid sha256:398965e34c1a4c747b42d8836c04934daaa43903b7931586ed12120e17a61f76 +size 1672548 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 53f3032a30..696620f879 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35c5715bcb1a16c343f3a28be105fb6fee1bbca24cf832f71a7d0f20cf9a0b3e -size 365015 +oid sha256:77cbd7d45164d24be73e021bc0a8745b4f021e4369a254e216ee00b36d3c7263 +size 366593 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp index 89a4eaa580..22a4ff75bf 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3335a8d4b2c0ca63f006c3f957d57aa3f808ef06d4adda322c311a333286d84 +oid sha256:3a3f74fbe72ef54b9c028d957353c1ecbff1d20bcc9619ff17ee37471934a2ab size 1126352 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 9cb2eb33c2..e0b9335b45 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:fdc0bf099862d352b3b765e117437240a82e4749d3efd104881647dd4ea14562 +oid sha256:b3af082c6742f385d0d2c96489ff1de314458eb992d6d5a251c737f8ec912e79 size 644092 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 153555cbe4..ec999849fa 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccd938df8f78af4eae306c6e9e669599c2baf6f095f956318470063c560fbd3c -size 1091610 +oid sha256:8e26f3b8cc173301b3cf07ba1ca7893b6f140432410b0b298361ecff597604c2 +size 1095556 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index cab205493a..284e084f3d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce4d35ab4c7b65476f0dcec635db1791fcb718afd6b3531338712f5b2bc9aa84 -size 1460204 +oid sha256:32220d11bc3542e9edcc36d51b4866bf40044213114d7e237e003afc1fc7c464 +size 1478358 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp index ab21a448f5..69a3f4789c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d088ce37b21d335ba1f92034cf97f78fc968d7fecaa0c4f9ec83a0d5165f1d99 +oid sha256:3ee5ae75df4866d848e90616562345d3740b17b68c90f06329dc074dba5217a9 size 482709 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp index 2fa6ba246e..c19635d688 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40653ec672098e2cb1f94c473fa67852efcf6b49a6e8109e4fcf39422281acb4 +oid sha256:817ae5c1eb8a8c6f22a76ab0b88075fd3391d06abb7dd6d9ab51206b809cd69d size 657930 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp index ebdb0563ef..a625def240 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96348957990518db6f51af7c681a71e625dede568cc8f8303dd2de8ad09bfc28 +oid sha256:680734da0abb1c3029dce32e892687f649c4219f66574acb15ab88471f508263 size 677218 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 7cd5b267e0..1691a77e1f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4687df80ac2fa9454b0564b0a80d78cfaedc2c7796c8f3a1010dd7ebbf722c83 +oid sha256:c27e871dd680022920081c30c5e239613e53b42129680fdb1d17668b5c5ddd9a size 369401 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp index f4da9b9d86..6e7098d6c7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8b9985065f5f2c62b74c05f8eed02b1909c96656b26fbd7779cc57a2146b037 -size 947140 +oid sha256:3e1ecaa635067924b692b665241d86e1d8c1d60a19290de7adde1ff2ca7dbeb0 +size 956612 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 8ffdb6589d..c38c3b29fd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23599e63b07ad966df921daf3cb97a9ed5cde27eeda0fd96ba5abd835b48f89a -size 590779 +oid sha256:d3018c622303f89c6f22f037ec99eaeaeea9cfe8911e22463b48a22c13116805 +size 592357 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1153714c7e..5d286a73e5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd1c452565583b20913d835de9b14c2f19c0cc431bc926ea6c92295362a85bca -size 1813864 +oid sha256:a7a381f2855236f418a40124a5254401c95001d5e15c074a704e22cc7ed89aa2 +size 1818600 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index b6383dcbd5..5290f97cfb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b20de2c6bb3081564ddfbf7ece80fb2c17e66f4e7ff0e0969da4e4655e90d1ec -size 2407418 +oid sha256:9bb49ace4dedc4faa3de2b9c22e09db0f3990129ce7ab4afb6419c38a5d48a16 +size 2427152 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 3713748af5..cb3d89f070 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a0e8bb2391128e688e5c6356f09a5ed189ce5c1bcdeef4efc0ce0415dc2849 -size 555245 +oid sha256:9769d7cb9754718798be515c84c45ff48e43322573f3f12e31c2e42e99d8dbd4 +size 557613 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp index 795d4d68fc..de925119b3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b014f41b1cfdf6ed2729778841213a36440191eb3c087346a02c21510bd3f0e -size 665794 +oid sha256:134f4a73e0e6b02b717319ec49e3b3ea0a585cad385a1f300e6c5761f12de9d7 +size 671320 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 5c8dbe22b2..64bb52e0df 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd77afeb7dcd1ff8d6be80788b20e92e4fbc8c3026ba12d1d522c99316754a7c -size 1740442 +oid sha256:7935b0f053a79a7e620c0efe274fa5b4c840fc9c6e439a381c4d380446e1cb68 +size 1744388 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp index ee1a46c9bc..87d96af432 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b674707d02aac297b66d523de8b11618ca1598c49eeaf7ce9b1c9d516ce95c4b -size 2247958 +oid sha256:74ecbbaa19b2efe97a3b12c488f0e03c2102f16c460239df4bfc19976fc4365e +size 2266902 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp index 349c2efdfe..15ad1d62a9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7556f88488e05ee669e763b839afa1b7690060cfa9d8482d419c0ca336df9352 +oid sha256:813265d25709bd2d39982efbaf092c9163b124bd990fccab505b3c22134522aa size 595585 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp index 2ccc55f144..4e62255a62 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac9d879aa0c70967bb3a79cd7034998baf43a544c0dd4444ebddeb76e78df5ae +oid sha256:dd36195c01bf7c2a2013d5f31d2e74c2579c471385d7b45be7e35ea2f0652608 size 908162 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp index ec1ef8aae9..10ee7b3d8c 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e781c0278fc46142f578ae51bfeb38767e89d9c25b92023215948f99dd1d3ed +oid sha256:31d4d6dca68c4632d1f435e9179582cfe2ad7a75ee0f7625ee67b0044c914f10 size 1371512 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp index d904de0acb..407d34a655 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d608e9e3ec460d2a38f43067a7d7a2dd408e068db690806bbafb11007e175336 +oid sha256:6570d3ee7b651dec797e82b31eb21fd3261c6e2639fb7c9b157f251bf98bb3bf size 1419662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp index 798e8482b4..d6b829a9a0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c1e1d300866c6425c2495e550230051debdca0a7eb85874ae33c0c2de8a81cb +oid sha256:88b972677c5436b90fe85870278e3b23d6f709608f99295bddf0be3861d95d1a size 1419662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp index bbcce09e72..7cac9a8325 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:132d83639e34af1b431abdcb3f09542d0389030b85752e18a3ae221ead7d24a3 +oid sha256:d975f605d62c3070d6cf72f6114d98642c520e66989ed2d2845c3213e921ebf7 size 1965880 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp index 83287a0376..9dd7d6bf8e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a96710f6c691580c2363c187a75fd436f5e6be732810a1a45182ce72dc52d1e +oid sha256:ef5a2728cbd3241f45f3d8285c91a818e11b2a9fedf322f343a9461d31a6ad30 size 1380182 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp index 0062377934..1b6d6cddf5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6339f008f451d030aa36a6b3fac7179e7534f7f2474d641fa0ebfbf487074e7 +oid sha256:16b5f3d3f8760dabc0849217cf11edf18d19896dda475a5fc233bbfd444faf33 size 1401494 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp index 0d719af97a..90decb8793 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57ebcae2b70fc28881f2b3969868d64c203ef4a9cbc9588a9e28051c5f5b6849 +oid sha256:cbacb235f39adaeabd68e2fc46c51aac6ca26cdf96293a6a7eb60b5be40640ef size 1401494 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp index ceab132d42..5628ced1f3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e2a4ce1b944feb2b3ed535943089a2d5968bf523b149885df78f7fa4bd7e835 +oid sha256:e6f3e068435339a64d47673f8018b66c202f6259d68e0a97a4a30acb7505a7fd size 1935872 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp index 2780675d9d..552a78df4f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5d456b30f89ad05ba5b852fabcffb3f8269913d83ef8c0e4e319f2243dee54d +oid sha256:7c2d7ab0692de5405b26d19a0c57d720285366ac12a8550bbabca1613cce7f0c size 
305897 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp index 2aa3fd4b0a..ca2d2a604d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85593d3c2fecb6842a72952c6dcbde19a70e6b26245829d279ca50bb391eb636 +oid sha256:91a26adfddc0bcaf8b42249f59f1a0b9f74be0f82c7378fe4b56f3a2fa3d4bf1 size 290109 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp index b050acbb5a..da475b4a2d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69cd61bd8334d2109067ef0460a91b8dba4c2cb07392eb636d72d025ccb15bf9 +oid sha256:6ef79c9e2e2d8bba55d7803dc8dc147b5d8babc29e906a43407a8722bbd8d939 size 498507 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp index e741d50f4c..09b401a003 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0427b7729ce3cfa652a4595d04f936a947febec8f2c96ce33eed7cbaaa05613e +oid sha256:0eef025f8e8581868b02bcea37ff225afebcbb2966450fb29fb0e32ac54eccd4 size 668214 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp index eee064e280..0c6a45eacc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:321bcd81b8965c8dfc08682f775508ae18e3ff711490ee8dff5fe56c20f74843 +oid sha256:abb2857ffb85cc36aae90ebb674635dffee2b2c5f7ad1ea81bb8002b65d5a0f8 size 711628 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp index 33f4d9cab3..9ecb64bd23 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa77d3789c0ca314689125ec303a8af76554120a708a4b63395c69b7aad07f04 +oid sha256:49a3661535314b139e2794fe16f6f3e0a8d45742b68ea59ba99a9113068adf2c size 752698 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp index 3138343090..d836cccd03 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa35aa70d0fa304c776c076a1a189d32a054d3f696dac5d99018085d1108c73b +oid sha256:d76fb6c4f8bb2de687bc5f9f275389356934119c1f0db9983dcf0ec7b68c6197 size 748726 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp index ca7815f710..79e1e96e9b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1a702d456b5acf279487dd810e3e33efdd1c7bd82530ceb5a32ad30ec30396c +oid sha256:be8ee89f4489c430d0ff6e9c6cf4e07379ac05abf468d47e34e084ad594b2037 size 946060 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp index 8bb9403c51..3c8b2528fc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:558aa7d42de329c49361c94c4baef16738304b21b6adbe675d77c7819ef37660 +oid sha256:aa4be8ca2dd52e56c9a6af76b90ac353d217fad5fa931b21129ac5a811b5283a size 489823 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp index 0754f76695..22fce024ea 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b5baa6048e6c33e74c6d343eb7c76252ff2e534fe467b3189af12b5d64af37c +oid sha256:cb0482b768a40bc7f8a86fa23a84bab62fb82c205f3237ff60becda50cbafc90 size 489823 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp index 68de134acb..c02b557e7f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e17cb191ad092e6db255ea503e49ea883ed56322fc58ed8d68710f6687376c1f +oid sha256:95b1796f4e7c905eca82ed3691427025f68e765797440b962b0114a5ab32b1d7 size 500083 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp index 3ebcc110ec..cbc081aae2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfca5660a931e08941347f7a0aefa82c214940e8eaa6b6d89cfded621f34a490 +oid sha256:2d9f13977fc865e716f1f35dfdb222a38000b224ff7394134230ed5c88119947 size 496125 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp index c0c882331e..cc613cc08d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fffd2cd799953808034d7e7b89a57d4fede24db124bfb0d3938188177acbdfeb +oid sha256:007e32a06fcac853159dc5786940447281c57ba70406d38beb6f089fd037053d size 182023 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp index 458aa250b4..d8ba524113 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19ada3a5d449542f103077db8d193bc2293a8f48ccee201e366473964287314c +oid sha256:26241ea5909395116e1b1a0f19cadc448886f6a6ab2b3ba76c092b67cd0148f0 size 182023 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp index 65edc3e52a..0206f71981 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9c32124cd708aab7da30637d85437da0af9bf2157d163c19c6fe14498698cda +oid sha256:86e4ca60a459117c5e701631fbd3c67ca66e81d177c394c1fc9ad3b66396e69a size 661096 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp index 8213475b06..3444d759b7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f248fd42759509c61d20f912ae74dc3a85448a9c8386370ea92492ed9031e80 +oid sha256:770db1f4ec1c2d3c25767593b60cb095e49f7a6eb7abe054bbdec6e72db97f8d size 672936 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp index 75bd11ff6e..b99affa020 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:190fd946ddc7e1b5e9ca2172ec1de39c6288829773d9ce29fe98374256eff566 +oid sha256:0b6428cae2d0c8c813925be9589c94771098cfe5a6d0ff2036104d3e36384b81 size 721900 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp index ed5e241d9e..e93db30f53 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7cd5976c836bcd75c0cadfe968050ac60bf89b93df021ad6c1681e159c497c5 +oid 
sha256:36c6932301fe3dc29631c28fcb8cb6b08652103bc7a36fd74a03a8189a1c77e4 size 717928 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp index 44ce0c307f..8f42d5a276 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c536d725e1d9ebd2cb836dfe3993edcc81101534db6b7f1943c8a9443838bf4 +oid sha256:d858f6dcaf3f49fb3fa18b1c8c20ee1b933e2c8ddd1a429c8d3b5b4d269fb875 size 927892 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp index 0216db308c..0cb2a13410 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5907da5a2f68c010d44bbbd0d780e097f9625be15b2f85e8dd1f00dd4c31ff9 +oid sha256:7dc92ab65ed0fc5f9d821f52a396a6d55ea9ae37e080eac7ff9e9c14eae741e7 size 631890 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp index c63b37264a..648e3acb00 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cf14c71134a89ed6ffc83c0b7db06ed10e22b55294dc15ddf7f016427f01033 +oid sha256:d66606a37cfe8eb78ccc3f548a231f770df9f46e70f6d3ba22fb8abe6216480e size 159919 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp index 7d1ac80867..6028cc1f32 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2b83c70dbc8ab0b3695dab3f4d2069b7ee7119e9140d7860b8c19f59a498589 +oid sha256:b723b296cff04602f64a5da9928e6f9b6a03c5cc608ba9ef7d8055f23f1f4ea2 size 159919 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp index 4041bfc97a..b1ee67b880 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc8369f5701dceea91d429a713ddcbb4ecb0ad08d3c9042688557ead5f00e9da +oid sha256:d40578a5684262cd8136705367e2c98493ea9b9fcfc123c7efa3ead14017b5b8 size 483493 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp index f0afe3fcf1..4ce3d2dba5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e9fffff2d13d49613e5f9334a010ca9bcde43b3bb55a792fd97fe2c867760dc +oid sha256:60cc82b9d11c53392de91a7c4c097263c20a56f9b346278c7c9af12ef2bb5fbf size 496123 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp index 03a4b33cef..d24465ed9c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd3041ba5a52263f7f02d64f1911c50e346151bf529e865c1abf22583abd3e21 +oid sha256:8f685b6b2a0a573953f31fad89fa37e949361db245de69c0c06ce0bbb14eacef size 443285 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp index 6984f3c170..dc49a30627 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12482099b086249163085e6e3421a61f6e304f865aaf56dd15382614be5e48e7 +oid sha256:834f0f3601c589893a21b957be2864df594f96b34b2cfd6018ada8319986aa21 size 441683 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp index 2bb4cc2582..4763a29923 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfea1ea1627eaef7b614db08bad00bda8b611c8e466c858e050c0ce2aee2eafb +oid sha256:3d81a070e7ed49f1e1a322d38a757a3505186cf5cbded99814e950e07229a46a size 298049 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp index 7e76c5e13d..c8587a81d3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f828600699faa3a0474085cbbe88d2e0ac7c8e056c976b81a882c3a72682e527 +oid sha256:b9de5bc49d888699da1880d24ccf6a9cb6c0049d7a244d1ae9ab64b7365ecd5a size 296445 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp index 1c1f7bdc42..7d299b8705 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4b297922065ecb79b4a1278d048b253b57601d011fc5833a32f9fc1b78e58e +oid sha256:e30ed0df4b0d0b1da1ace5831dc0a7a526e04001b25860f862345c78acff5a43 size 427485 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp index 68394c07c1..47eeb69632 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd5305445c9856fbd5d9dfaffdd7f87b9014638f33fb63fb2cb4fce9893b20b +oid sha256:030015dc1811e3dc2ae36ed770f51063a3f46deae42ead5e1523c977b438a133 size 425883 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp index 51778ad0e9..1a5b22eed8 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b7fee97097f799830df2bcb1c782c7ea9018243cbd5cd0e0f47ec299b49db79 +oid sha256:6921a204892e1336cef2a308be38855f3c888e56bd6a16752d2806aa9e93c431 size 1524634 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp index 537871847d..834fa7d1c0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ac2f9270988bc02329ce11ef3413395b2b8cdc55fcf4911d170536c6e618317 -size 403697 +oid sha256:200df98fb2fcc734e8fc012c98c5d78c2061e5718eef6ffd50c2358a3d664197 +size 406065 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp index 6bf814ac8a..e085961e98 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1234cf31a3a6b84ed25fa0ad6c4df9b53f673f6bac2f639a66086ba50f8717ba -size 1120818 +oid sha256:430194fe07e526ad01a1e0fb43273b240c269215b132c9af248ba386dcbda23e +size 1124766 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp index 3bebbebcf1..2d56be2925 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fff300932a16d30844e317ace515a178f159c483e436f6955983b96c5c424c6 -size 1549402 +oid sha256:53a07904a7bfbf82380c96af99c5e24bc86f77906c5d6fdc85ef9720639d76d2 +size 1569136 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index ef64a37682..6d074921cd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed10767ec913d314936fc5dbd1fd70c5381a622bf3fcf1590f837da6d3285bca -size 723774 +oid sha256:1ce4d27b11fee3e5f6489510b55613177e174660b6c7a6fb4efed862b62c50d7 +size 731668 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index d0bc52f131..a626899316 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e7a7a9653a9c4e4e9b0514fc1d70abbb4521c7edbede52568d17d0779d62ffb -size 671662 +oid sha256:3992d7bd34e72089c5cffc4fc6de3f70a3995145b989811f83b00b47c96b5159 +size 681924 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 3056a533d6..d95d392d53 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e18db0cd4de65e76e30f219d24ec00095fb16005882c43322182c5fa3f59032 -size 445541 +oid sha256:521417177fc0447809c07ff86b58725fedbf1a6b9412ace4c50268a20bc2680d +size 447119 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp index 50d7f1bece..c405f483ae 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aceb502c1a95f58f1eab515cf2aeac92be6d255ef405008a4fd871fd54e9ba6 +oid sha256:cb063c946558e6928faabb85df9775fecd2b9444b40b3e06cf0f863db80a5ad8 size 1242842 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1a74df1288..e88a310b64 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ 
-1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec96248452f638bb9ca50d3630dd67caf71322c01b17aff301c4a98eb7e27974 -size 1215548 +oid sha256:31e6b7442b277f5206cc1d70fa6021f36170265b311106281e88b4611d1a5b6b +size 1220284 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index e03f7c2575..0db1249a28 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dabc44860e81532e9b7ecb35773d0ad409d45361e20c9510d24387039999a7c3 -size 1720698 +oid sha256:c1342769efa91794d5bd35ac623b3014738b075b2671441668e2f0d5c1eef78a +size 1739642 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index b1d87c1278..4d68087ca1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d9c8d1fe282f46c12898ed4851a2640cb33ba5d75c5fe9da8a988f818a0e733 -size 407639 +oid sha256:a49dd8abcca57a64eb2ab4e00e4e0d26edf68488fb67086a4b466f8e6651522e +size 410007 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp index 2a12ddb711..deb498b1a2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:849a280994b3fa1f18ca6c3866a16a68a9b02831f134f8dfcf0d34502c1d6772 +oid sha256:a7013b1eea12719ebeaf47facc37ef730bb0d6af03ca2ad890724a25448616a9 size 1102672 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index a2c78e856d..4bf37280a0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e209b01409585433406f8392c77a7398270ee1b58446b728cf74faa6fe1bf9a +oid sha256:a16aeaf5d11a4c25461452b5f3145136b31861ef9c443d7ec82066565275d6f8 size 629884 diff 
--git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 61bbc8d762..0115c2c36f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a22bb0202916831eced0a44acbab769d5647937155e0a2b5e6d0d0cb83c726f -size 1122394 +oid sha256:a7d4526887fe860e0d9c482fc7fe2cfe646c7a20bc8a0813ce33a01fd9cc733c +size 1125550 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index e0170f8db7..5d1d220755 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:582d17d48c7a751a345f74cc8c74f9b8c05278ddfc185da4906310a4973a9bdb -size 1547030 +oid sha256:b880e78ffc354edb541bd612e543dd894843fc4163f7bd65ce53282892381b8a +size 1566764 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp index 456d75f72f..fbab68022c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70f02b7329eef7ceeb73dd43c3bf8f6ea6132c593bba6dbbed720d8b8ff0c287 +oid sha256:de26acaa532f197e339b6d5b2a2dd8032d505c9e169fce38000b02b2a4188eff size 603809 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 0c0712acaf..8315c08084 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f67d4e70c39bf379ed0f3ef73a3690ac64efaee1e7134c793a760924c270f046 +oid sha256:cef5bcfe63650bc924d9e45d2755b50940534999fb4fbad3a8abf0ba73b9245a size 329935 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp index f35d06ef06..c57602da24 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2c284c6cb66207bd204bd1b6abe45aa8bf2e0c92631681861df237b8f849a46 -size 363451 +oid sha256:b332d4c6047c98b504cd3be72cc5028d240621c8e0a3260d64c17804982104db +size 365029 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp index 73d9547cf2..a0fe210d9b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3bede327d80be420e7bf011ee1a4156365afff7020bbf5a8434da18cb19fb23 -size 1093202 +oid sha256:a16c23767a2e5efbd7330728ed87af2ec62a7731debe1da557705c6db6d3268e +size 1096360 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp index 998e46d1f1..3c10c48136 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ee7695bd5bb0a03eafe29a497060d84caec96ca4d159e99e4f02b99977dd2a6 -size 1469690 +oid sha256:66950bc137b734d509f0574152bcf9cf7efcb17a7483450d5fdbf480e9f83001 +size 1486266 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index a76bf3814f..0b4847611f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cecca7ad5c652989a3008c8219177811ab9c7d617adbbc9ed8548141803c66f5 -size 694578 +oid sha256:bba586d9fe487c49cef2abfbfb0a078dde907d28e04b4d2335018cdb7031879c +size 701682 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 71a5743dd9..fb1751942e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd6847c0e897eb794a9b1ff67e64358527fe64c3e01fc214545cf76ec60edc6d -size 644046 +oid sha256:d3e45ab30e471f4649807f5b7640512e2c6678cf623cadfcb26c93eb4ad60ec0 +size 654306 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index ea50fb0631..ca8b31a010 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:118cc6d4a5e3e12ce0f2727361fd1d52d1a49c67d0bd1837c24e528c064a0dd7 -size 415557 +oid sha256:1932937b7f4ad0370341c77a03db133dd676bdf844b13eb45ec10243d1dfd16b +size 417135 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 285c32ec70..85d85fa4d9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36d6c97af5fb15f32cd1ff13f53dd98a7d670cb80ee766765f42cc453f730812 -size 1195826 +oid sha256:c11f5d464b0486023b78babfdfe9d2768e4b0d13caeb436d6f73110ede72498c +size 1198982 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index bd266daa63..465fcafece 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7775bbc1b43487236cf7570d2ed900f1c9830eab70aac1fa9dc59c439cc0c687 -size 1657562 +oid sha256:3bac9b40302bbfc6ee5a49e5c45d3238f46cff45619acd1b098d90e758d3ce30 +size 1675716 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 2d3c2887be..c65fa93d24 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:199b1ff3cc3d0ff04477ff8f1e6390dd62b3a7c9dd264cc73ce6c716af20a0f9 -size 366603 +oid sha256:26f09ab86b52c40b283652e555f677850f00902151d17e375e016b9a99a97794 +size 368183 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp index e0073c3730..36bdbdda6b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e743b470f9607abcbc8b71e7ef67455e6104daf3a80d0bd012a96ecf90a8f18 +oid sha256:960c3f9e4fe46fc6390207ba0ed85ec25435045e2213b60c5d44ea9ab4fa56aa size 1128730 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 1553e77aee..58a89a84a2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:366aa4e9f3263f73c4e76c0ea8008c0449b6d89bcade761500af949912786e32 +oid sha256:ac167d89ea3150f7b65614645ef09f13e2543bdc0523c1eddce5bbd9cfd306ee size 644892 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index cd0531dde0..cd64d2fe38 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8a8d76e17a24afd7af1dc5e112828f98ace78e3f85a7efaadb0cf1937085cc -size 1093198 +oid sha256:9d0cf59a8114940070448d87d02d9e83d53bb371ca9915c3983e03626d17024e +size 1097144 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index 
54fd20f69c..f3194ad186 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeffa2db467fbae3ace85fae9f31e2b8a7c0923ab349ade42318ae6f55249ac8 -size 1462582 +oid sha256:ff1449b6795f5beda0b6a62e8a1171ce952b07c4e63b607c06f5fedddb2debe9 +size 1480736 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp index 673041f7af..87c5afddec 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffc92513e64631c33290f1e88e5666f5b85251506d527745c493f2e90da39de4 +oid sha256:cb14ae0271f8a83216f67c111530d3fe1be2231541ded5f992ff45226ae90e69 size 678808 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index c39e7fa450..dad37ebd42 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faad8cb1e44f5e16f61720966d2a6c9e782461c209cd8000263b50d42093444d +oid sha256:46a0d8e0a9495e03f72526b4ee04fa3d2a2d87984057b44550cabf4ffa745ef4 size 370201 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index e2ee736b49..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd930ed415b0303a973a37550ee33fa4975ad6be0cc58d461370b127f9a90f8e -size 1020542 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 95d9b2bf64..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f2b243127e1ce00a850a10cca104ffc42512711f434fbdf8683eeeb49b8ce42 -size 1056062 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 
index 0c093db643..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ce9cc89b1db7f7e4b76b94cf1c3b04db49a2d86b529b1fc85b19057a99bc9fa -size 1007924 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index c24e239dd0..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e176513fa0074d688620299dfca53adc3902491e97ea9b6938a4ceb2fcf17ef5 -size 1068702 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index a0197d8083..29c9eea339 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -238,6 +238,9 @@ void FusedMHARunnerV2::setupKernelParams(MHARunnerParams runnerParams) mKernelParams.packed_mask_ptr = runnerParams.packedMaskPtr; mKernelParams.cu_mask_rows = reinterpret_cast(runnerParams.cuMaskRowsPtr); } + TLLM_CHECK_WITH_INFO( + runnerParams.attentionSinksPtr == nullptr || mSM == kSM_90, "The attention sinks is only supported on SM90."); + mKernelParams.attention_sinks_ptr = runnerParams.attentionSinksPtr; mKernelParams.cu_q_seqlens = reinterpret_cast(runnerParams.cuQSeqLenPtr); mKernelParams.tile_id_counter_ptr = reinterpret_cast(runnerParams.tileCounterPtr); // TRT doesn't support host scales. Use device scales instead. diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index 96435cca52..e909886616 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -263,6 +263,8 @@ struct MHARunnerParams void* outputSfPtr; // The softmax_status ptr for RingAttention. void* softmaxStatsPtr; + // The attention sinks ptr. + float const* attentionSinksPtr; // The packed mask ptr. void const* packedMaskPtr; // The cumulative Q sequence lengths. @@ -352,6 +354,8 @@ struct Fused_multihead_attention_params_v2 KVBlockArrayForContextFMHA paged_kv_cache; // The mask to implement drop-out. void const* packed_mask_ptr; + // The attention sinks. + float const* attention_sinks_ptr; // The O matrix (output). 
void* o_ptr; // The Softmax stats vector of layout [2, B, S, H], including softmax_sum and softmax_max diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt index 7a02cdee73..fd89ae4a19 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt @@ -218,6 +218,11 @@ if(USING_OSS_CUTLASS_MOE_GEMM) set(MOE_GEMM_SRC_CU_LAUNCHER ${MOE_GEMM_SRC_CU}) list(FILTER MOE_GEMM_SRC_CU_LAUNCHER EXCLUDE REGEX ".*moe_gemm_kernels_.*") list(FILTER MOE_GEMM_SRC_CU INCLUDE REGEX ".*moe_gemm_kernels_.*") + set(MOE_GEMM_SRC_CU_HOPPER_FP4 ${MOE_GEMM_SRC_CU}) + list(FILTER MOE_GEMM_SRC_CU_HOPPER_FP4 INCLUDE REGEX + ".*moe_gemm_kernels_(bf16|fp16)_fp4.*") + list(FILTER MOE_GEMM_SRC_CU EXCLUDE REGEX + ".*moe_gemm_kernels_(bf16|fp16)_fp4.*") set(MOE_GEMM_SRC_CU_FP4 ${MOE_GEMM_SRC_CU}) list(FILTER MOE_GEMM_SRC_CU_FP4 INCLUDE REGEX ".*fp4.*") list(FILTER MOE_GEMM_SRC_CU EXCLUDE REGEX ".*fp4.*") @@ -230,6 +235,10 @@ if(USING_OSS_CUTLASS_MOE_GEMM) add_library(_moe_gemm_launcher OBJECT ${MOE_GEMM_SRC_CU_LAUNCHER}) add_cuda_architectures(_moe_gemm_launcher 89) + add_library(_moe_gemm_hopper_fp4 OBJECT ${MOE_GEMM_SRC_CU_HOPPER_FP4}) + set_cuda_architectures(_moe_gemm_hopper_fp4 90) + process_target(_moe_gemm_hopper_fp4 true false) + add_library(_moe_gemm_fp4 OBJECT ${MOE_GEMM_SRC_CU_FP4}) set_cuda_architectures(_moe_gemm_fp4 100f 120f) process_target(_moe_gemm_fp4 false true) @@ -239,8 +248,9 @@ if(USING_OSS_CUTLASS_MOE_GEMM) process_target(_moe_gemm_fp8 true true) add_instantiations(moe_gemm_src ${INSTANTIATION_GENERATION_DIR}/gemm_grouped) - target_link_libraries(moe_gemm_src PRIVATE _moe_gemm_launcher _moe_gemm_fp4 - _moe_gemm_fp8) + target_link_libraries( + moe_gemm_src PRIVATE _moe_gemm_launcher _moe_gemm_hopper_fp4 _moe_gemm_fp4 + _moe_gemm_fp8) target_include_directories( moe_gemm_src PUBLIC ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/include) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h index e6c3a6bbfa..646be2575c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h @@ -27,6 +27,7 @@ enum class ActivationType Silu, Swiglu, Geglu, + SwigluBias, Identity, InvalidType }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h index 7ddd756e0d..1237884d13 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h @@ -210,8 +210,10 @@ struct TmaWarpSpecializedGroupedGemmInput struct INT4GroupwiseParams { - constexpr static int group_size = 128; // Unused, hard-coded to 128 + constexpr static int int4_group_size = 128; + constexpr static int wfp4a16_group_size = 32; bool enabled = false; + bool use_wfp4a16 = false; using SFA = __nv_bfloat16; using SFB = __nv_bfloat16; // Unused using ProblemShapeInt = cutlass::gemm::GroupProblemShape>; @@ -254,7 +256,8 @@ struct TmaWarpSpecializedGroupedGemmInput constexpr bool isGatedActivation(ActivationType activation_type) { - return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu; + return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu + || activation_type == ActivationType::SwigluBias; } template && 
(std::is_same_v || std::is_same_v); +#else + static constexpr bool use_wfp4a16 = std::is_same_v && std::is_same_v; +#endif #if defined(ENABLE_FP8) static constexpr bool use_fp8 = (std::is_same_v @@ -282,6 +291,7 @@ public: static constexpr bool use_w4afp8 = false; static constexpr bool use_wfp4afp4 = false; #endif + static constexpr bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; #if defined(ENABLE_FP4) static constexpr bool use_fp4 = std::is_same_v; @@ -306,9 +316,9 @@ public: [[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const; [[nodiscard]] bool supportsTmaWarpSpecialized() const; - [[nodiscard]] bool isFusedGatedActivation( - cutlass_extensions::CutlassGemmConfig gemm_config, bool is_gated_activation, int gemm_n, int gemm_k) const; - [[nodiscard]] bool supportsFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const; + [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config, + ActivationType activation_type, int gemm_n, int gemm_k) const; + [[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const; size_t getMaxWorkspaceSize(int num_experts) const; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index c7c9a55b95..2c816e9a34 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -87,6 +87,62 @@ struct LoraParams namespace cutlass_kernels { +static inline size_t pad_to_multiple_of_16(size_t const& input) +{ + static constexpr int ALIGNMENT = 16; + return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); +} + +class CubKeyValueSorter +{ +public: + CubKeyValueSorter(); + + CubKeyValueSorter(int const num_experts_per_node); + + void updateNumExperts(int const num_experts_per_node); + + static size_t getWorkspaceSize(size_t const num_key_value_pairs, int const num_experts_per_node); + + void run(void* workspace, size_t const workspace_size, int const* keys_in, int* keys_out, int const* values_in, + int* values_out, size_t const num_key_value_pairs, cudaStream_t stream); + +private: + static int expertsToBits(int experts); + int num_experts_; + int num_bits_; +}; + +struct ActivationParams +{ + ActivationType activation_type; + float const* swiglu_alpha = nullptr; + float const* swiglu_beta = nullptr; + float const* swiglu_limit = nullptr; + + explicit ActivationParams(ActivationType activation_type) + : activation_type(activation_type) + { + TLLM_CHECK_WITH_INFO(activation_type != ActivationType::SwigluBias, + "SwigluBias is not supported in ActivationParams without swiglu_alpha and swiglu_beta"); + } + + ActivationParams( + ActivationType activation_type, float const* swiglu_alpha, float const* swiglu_beta, float const* swiglu_limit) + : activation_type(activation_type) + , swiglu_alpha(swiglu_alpha) + , swiglu_beta(swiglu_beta) + , swiglu_limit(swiglu_limit) + { + } + + // TODO Port everything properly and get rid of these implicit conversions + operator ActivationType() const + { + return activation_type; + } +}; + /** * \brief Describes what parallelism mode the MoE is using * @@ -394,7 +450,7 @@ public: virtual void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, - ActivationType fc1_activation_type, void 
const* fc2_expert_weights, void const* fc2_expert_biases, + ActivationParams fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool const enable_alltoall, @@ -410,7 +466,7 @@ public: float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationType fc1_activation_type, float const** alpha_scale_ptr_array, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids) @@ -474,6 +530,13 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface = tensorrt_llm::kernels::fp8_blockscale_gemm::CutlassFp8BlockScaleGemmRunnerInterface; using ScaleBiasType = BackBoneType; using Self = CutlassMoeFCRunner; + +#if defined(ENABLE_BF16) + static constexpr bool use_wfp4a16 + = std::is_same_v && (std::is_same_v || std::is_same_v); +#else + static constexpr bool use_wfp4a16 = std::is_same_v && std::is_same_v; +#endif #if defined(ENABLE_FP8) static constexpr bool use_fp8 = (std::is_same_v || std::is_same_v) &&!std::is_same_v; @@ -485,6 +548,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface static constexpr bool use_fp8 = false; static constexpr bool use_w4afp8 = false; #endif + static constexpr bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; #if defined(ENABLE_FP4) static constexpr bool act_fp4 = std::is_same_v; static constexpr bool weight_fp4 = std::is_same_v; @@ -541,7 +605,7 @@ public: void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, - ActivationType fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, + ActivationParams fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool const enable_alltoall, @@ -563,7 +627,7 @@ public: TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationType fc1_activation_type, float const** alpha_scale_ptr_array, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, int* 
active_expert_global_ids); @@ -591,7 +655,7 @@ public: float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationType fc1_activation_type, float const** alpha_scale_ptr_array, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids) override @@ -679,7 +743,7 @@ public: private: std::pair setupTmaWarpSpecializedInputs( - int64_t num_rows, int64_t expanded_num_rows, ActivationType fc1_activation_type, int64_t hidden_size, + int64_t num_rows, int64_t expanded_num_rows, ActivationParams fc1_activation_type, int64_t hidden_size, int64_t inter_size, int64_t num_experts_per_node, void const* input_activations_void, TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, void* final_output, WeightType const* fc1_expert_weights, WeightType const* fc2_expert_weights, QuantParams quant_params, @@ -727,7 +791,7 @@ private: bool mayHaveFinalizeFused() const { return moe_gemm_runner_.supportsTmaWarpSpecialized() && moe_gemm_runner_.getSM() == 90 - && !use_deterministic_hopper_reduce_ && !use_w4afp8; + && !use_deterministic_hopper_reduce_ && !use_w4_groupwise; } // TODO: This should eventually take the quant params to give more flexibility @@ -758,7 +822,7 @@ private: WeightType const* const fc1_expert_weights, ScaleBiasType const* const fc1_expert_biases, float const* const fc2_fp8_quant, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationType fc1_activation_type, QuantParams& quant_params, cudaStream_t stream); + ActivationParams fc1_activation_type, QuantParams& quant_params, cudaStream_t stream); static void BlockScaleFC2(DeepSeekBlockScaleGemmRunner& gemm_runner, T const* const input, void* const gemm_output, OutputType* const final_output, int64_t const* const expert_first_token_offset, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl index a0ebfbde34..651b7f1406 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl @@ -85,15 +85,14 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput::type; - using ElementA = cutlass::float_e4m3_t; + using ElementA = typename TllmToCutlassTypeAdapter::type; using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Alignment of A matrix in units of elements (up to 16 bytes) // B matrix configuration - // using ElementB = typename TllmToCutlassTypeAdapter::type; - using ElementB = typename cutlass::int4b_t; + using ElementB_ = typename TllmToCutlassTypeAdapter::type; + using ElementB = std::conditional_t, cutlass::int4b_t, ElementB_>; using LayoutB = cutlass::layout::ColumnMajor; // 
Layout type for B matrix operand constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of @@ -108,9 +107,13 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput>; // Scale configuration - constexpr int PackedScalesNum = get<2>(CTAShape{}) / 128; - using ElementScalePacked - = cutlass::Array; + constexpr bool use_wfp4a16 = std::is_same_v; + constexpr int group_size = use_wfp4a16 ? cutlass::gemm::collective::detail::mxfp4_group_size + : cutlass::gemm::collective::detail::int4_group_size; + constexpr int PackedScalesNum = get<2>(CTAShape{}) / group_size; + using ElementScale = std::conditional_t; + using ElementScalePacked = cutlass::Array; using LayoutScale = cutlass::layout::RowMajor; // C/D matrix configuration @@ -170,20 +173,21 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput(hopper_inputs.ptr_b), hopper_inputs.stride_b, reinterpret_cast(hopper_inputs.ptr_a), hopper_inputs.stride_a, reinterpret_cast(hopper_inputs.int4_groupwise_params.ptr_s_a), - hopper_inputs.int4_groupwise_params.stride_s_a, int(inputs.groupwise_quant_group_size)}, + hopper_inputs.int4_groupwise_params.stride_s_a, group_size}, {fusion_args, reinterpret_cast(hopper_inputs.ptr_c), hopper_inputs.stride_c, reinterpret_cast(hopper_inputs.default_epilogue.ptr_d), hopper_inputs.default_epilogue.stride_d}, @@ -205,7 +209,7 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput(hopper_inputs.ptr_b), hopper_inputs.stride_b, reinterpret_cast(hopper_inputs.ptr_a), hopper_inputs.stride_a, reinterpret_cast(hopper_inputs.int4_groupwise_params.ptr_s_a), - hopper_inputs.int4_groupwise_params.stride_s_a, int(inputs.groupwise_quant_group_size)}, + hopper_inputs.int4_groupwise_params.stride_s_a, group_size}, {fusion_args, reinterpret_cast(hopper_inputs.ptr_c), hopper_inputs.stride_c, reinterpret_cast(hopper_inputs.default_epilogue.ptr_d), hopper_inputs.default_epilogue.stride_d}, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu new file mode 100644 index 0000000000..be29019bc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "moe_gemm_template_dispatch.h" + +namespace tensorrt_llm::kernels::cutlass_kernels +{ +#ifdef ENABLE_BF16 +template class MoeGemmRunner<__nv_bfloat16, __nv_fp4_e2m1, __nv_bfloat16>; +#endif +} // namespace tensorrt_llm::kernels::cutlass_kernels diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu new file mode 100644 index 0000000000..f1a885ea77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moe_gemm_template_dispatch.h" + +namespace tensorrt_llm::kernels::cutlass_kernels +{ +template class MoeGemmRunner; +} diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h index ff582ec6e6..56a8299f18 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h @@ -99,6 +99,7 @@ struct genericMoeGemmKernelLauncher static_assert(cutlass::platform::is_same::value || cutlass::platform::is_same::value + || cutlass::platform::is_same::value || cutlass::platform::is_same::value); static_assert(arch::kMinComputeCapability < 90, "Sm90+ architecture should use specialized kernels"); @@ -503,7 +504,8 @@ MoeGemmRunner::getAmpereConfigs(int sm auto config_type_param = static_cast( weight_only_flag | simt_only_flag | grouped_gemm_flag | enable_hopper | fp8_only_flag); - if (!kernels::cutlass_kernels::isValidAmpereMOESpecialisation() || (use_w4afp8 && sm != 89)) + if (!kernels::cutlass_kernels::isValidAmpereMOESpecialisation() || (use_w4afp8 && sm != 89) + || use_wfp4a16) { return {}; } @@ -580,18 +582,19 @@ int MoeGemmRunner::getSM() const // currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction template bool MoeGemmRunner::supportsFusedGatedActivation( - bool is_gated_activation, int gemm_n, int gemm_k) const + ActivationType activation_type, int gemm_n, int gemm_k) const { constexpr bool ENABLE_FUSED_GATED_ACTIVATION = true; - return is_gated_activation && std::is_same_v && !std::is_same_v && !use_fp8 - && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION; + return (activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu) + && std::is_same_v && !std::is_same_v && !use_fp8 && (this->getSM() >= 80) + && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION; } template bool MoeGemmRunner::isFusedGatedActivation( - cutlass_extensions::CutlassGemmConfig gemm_config, bool is_gated_activation, int gemm_n, int gemm_k) const + cutlass_extensions::CutlassGemmConfig gemm_config, ActivationType 
 {
-    return supportsFusedGatedActivation(is_gated_activation, gemm_n, gemm_k) && !gemm_config.is_tma_warp_specialized;
+    return supportsFusedGatedActivation(activation_type, gemm_n, gemm_k) && !gemm_config.is_tma_warp_specialized;
 }
 
 template
@@ -623,26 +626,41 @@ void MoeGemmRunner::dispatchToArch(
     if (sm_ >= 75 && sm_ < 80)
     {
-        dispatchMoeGemmToCutlass(
-            inputs, multi_processor_count_);
-    }
-    else if (sm_ >= 80 && sm_ < 90)
-    {
-        if constexpr (use_fp8 || use_w4afp8)
+        if constexpr (!std::is_same_v)
         {
-#if defined(ENABLE_FP8)
-            static_assert(!std::is_same_v && !std::is_same_v,
-                "FP8 GEMM Output not supported");
-#endif
-
-            TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
-            dispatchMoeGemmToCutlass(
+            dispatchMoeGemmToCutlass(
                 inputs, multi_processor_count_);
         }
         else
         {
-            dispatchMoeGemmToCutlass(
-                inputs, multi_processor_count_);
+            TLLM_THROW("FP4 data type is not supported on SM < 90");
+        }
+    }
+    else if (sm_ >= 80 && sm_ < 90)
+    {
+
+        if constexpr (!std::is_same_v)
+        {
+            if constexpr (use_fp8 || use_w4afp8)
+            {
+#if defined(ENABLE_FP8)
+                static_assert(!std::is_same_v && !std::is_same_v,
+                    "FP8 GEMM Output not supported");
+#endif
+
+                TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
+                dispatchMoeGemmToCutlass(
+                    inputs, multi_processor_count_);
+            }
+            else
+            {
+                dispatchMoeGemmToCutlass(
+                    inputs, multi_processor_count_);
+            }
+        }
+        else
+        {
+            TLLM_THROW("FP4 data type is not supported on SM < 90");
         }
     }
     else if (sm_ >= 90)
@@ -659,7 +677,7 @@ void MoeGemmRunner::dispatchToArch(
     }
 
     if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation()
-        && !use_w4afp8)
+        && !use_w4_groupwise)
     {
         // We allow both tma warp specialized and SM80 configurations to coexist because for some cases with small
         // numbers of tokens SM80 is faster. We check here to see which is selected
@@ -701,33 +719,39 @@ void MoeGemmRunner::dispatchToArch(
     // Hopper finegrained INT4 WS grouped GEMM
     if constexpr (use_w4afp8)
     {
-        if (inputs.gemm_config.is_tma_warp_specialized)
+        TLLM_CHECK_WITH_INFO(
+            inputs.gemm_config.is_tma_warp_specialized, "w4afp8 is only supported for TMA warp specialization");
+        // EpilogueTag is ignored
+        if (inputs.k % 512 == 0)
         {
-            // EpilogueTag is ignored
-            if (inputs.k % 512 == 0)
-            {
-                sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
-                    inputs, hopper_inputs, multi_processor_count_, nullptr);
-            }
-            else if (inputs.k % 256 == 0)
-            {
-                sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
-                    inputs, hopper_inputs, multi_processor_count_, nullptr);
-            }
-            else if (inputs.k % 128 == 0)
-            {
-                sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
-                    inputs, hopper_inputs, multi_processor_count_, nullptr);
-            }
-            else
-            {
-                TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k);
-            }
-            return;
-        };
+            sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr);
+        }
+        else if (inputs.k % 256 == 0)
+        {
+            sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr);
+        }
+        else if (inputs.k % 128 == 0)
+        {
+            sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr);
+        }
+        else
+        {
+            TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k);
+        }
+        return;
+    }
+
+    if constexpr (use_wfp4a16)
+    {
+        TLLM_CHECK_WITH_INFO(
+            inputs.gemm_config.is_tma_warp_specialized, "wfp4a16 is only supported for TMA warp specialization");
+        // EpilogueTag is ignored
+        sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr);
+        return;
     }
 #endif
@@ -779,7 +803,7 @@ size_t MoeGemmRunner::getMaxWorkspaceS
 template
 size_t MoeGemmRunner::calcMaxWorkspaceSize(int num_experts) const
 {
-    if constexpr (use_w4afp8)
+    if constexpr (use_w4_groupwise)
     {
         return calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(
             num_experts, multi_processor_count_);
@@ -788,7 +812,8 @@ size_t MoeGemmRunner::calcMaxWorkspace
     {
         return 0;
     }
-    if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() && !use_w4afp8)
+    if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() && !use_w4afp8
+        && !use_wfp4a16)
     {
         auto configs = getTmaWarpSpecializedConfigs(sm_);
         auto fpX_block_scaling_type = TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NONE;
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
index 9a9f2ebeb3..affa4d8c40 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
@@ -153,10 +153,13 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
     // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually perform the best
     // for mixed type gemms.
-    constexpr int Ktile = 128 * PackedScalesNum / sizeof(T);
-    TLLM_CHECK(sizeof(T) == 1);
+    constexpr int Ntile = (std::is_same_v) ? 64 : 128;
+    constexpr int Ktile = (std::is_same_v) ? 128 : 128 * PackedScalesNum / sizeof(T);
+    TLLM_CHECK(sizeof(T) == (std::is_same_v) ? 2 : 1);
+    using _Ntile = Int;
     using _Ktile = Int;
+
     switch (inputs.gemm_config.tile_config_sm90)
     {
     case tkc::CutlassTileConfigSM90::CtaShape64x16x128B:
@@ -172,8 +175,8 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
             inputs, hopper_inputs, sm_count_, workspace_size);
         break;
     case tkc::CutlassTileConfigSM90::CtaShape64x128x128B:
-        sm90_dispatch_moe_mixed_dtype_gemm_config>(
-            inputs, hopper_inputs, sm_count_, workspace_size);
+        sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size);
         break;
         // case tkc::CutlassTileConfigSM90::CtaShape64x256x128B:
         //     sm90_dispatch_moe_mixed_dtype_gemm_config
 size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_count_)
 {
     size_t count = 0;
+    constexpr int Ktile = (std::is_same_v) ? 256 : 512;
+    using _Ktile = Int;
+
 #ifdef COMPILE_HOPPER_TMA_GROUPED_GEMMS
     GroupedGemmInput inputs{};
     inputs.num_experts = num_experts;
     sm90_generic_mixed_moe_gemm_kernelLauncher, Shape<_1, _1, _1>,
+        tensorrt_llm::cutlass_extensions::EpilogueOpDefault, Shape<_128, _64, _Ktile>, Shape<_1, _1, _1>,
         cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative,
         cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>(
         inputs, TmaWarpSpecializedGroupedGemmInput{}, sm_count_, &count);
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
index 0caf687b56..ab3cf2a5b8 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
@@ -997,12 +997,12 @@ __device__ auto quantizePackedFPXValue(ComputeElem& post_act_val, float global_s
     TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType scaling_type)
 {
     constexpr bool is_fp8 = std::is_same_v;
-    static constexpr int NumThreadsPerSF = VecSize / CVT_FP4_ELTS_PER_THREAD;
+    static constexpr int NumThreadsPerSF = VecSize / CVT_ELTS_PER_THREAD;
     // Quantize the input to FP4
     static_assert(std::is_same_v || std::is_same_v);
-    static_assert(ComputeElem::kElements == CVT_FP4_ELTS_PER_THREAD);
+    static_assert(ComputeElem::kElements == CVT_ELTS_PER_THREAD);
     PackedVec packed_vec{};
-    for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++)
+    for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++)
     {
         packed_vec.elts[i].x = static_cast(post_act_val[i * 2 + 0]);
         packed_vec.elts[i].y = static_cast(post_act_val[i * 2 + 1]);
@@ -1013,10 +1013,9 @@ __device__ auto quantizePackedFPXValue(ComputeElem& post_act_val, float global_s
         = act_sf_flat + getOffsetActivationSF(expert_id, num_tokens_before_expert, num_cols, scaling_type);
     // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this expert
-    auto sf_out
-        = cvt_quant_to_fp4_get_sf_out_offset(
-            std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */,
-            num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED);
+    auto sf_out = cvt_quant_get_sf_out_offset(
+        std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */,
+        num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED);
     // Do the conversion and set the output and scaling factor
     auto func = [&]()
@@ -1055,19 +1054,18 @@ __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id, int
         : TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX);
     // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this expert
-    auto sf_out
-        = cvt_quant_to_fp4_get_sf_out_offset(
-            std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */,
-            num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED);
+    auto sf_out = cvt_quant_get_sf_out_offset(
+        std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */,
+        num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED);
     if (sf_out)
     {
         if (input_sf)
         {
             auto const sf_in
-                = cvt_quant_to_fp4_get_sf_out_offset(std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */,
-                    num_cols, const_cast(input_sf),
-                    FP4QuantizationSFLayout::SWIZZLED);
+                = cvt_quant_get_sf_out_offset(
+                    std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */,
+                    num_cols / VecSize, const_cast(input_sf),
+                    QuantizationSFLayout::SWIZZLED);
             *sf_out = *sf_in;
         }
         else
@@ -1162,7 +1160,12 @@ __device__ void computeTmaWarpSpecializedInputStrides(
     {
         layout_info.int4_groupwise_params.stride_s_a[out_idx]
             = cutlass::make_cute_packed_stride(TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::StrideSFA{},
-                cute::make_shape(gemm_n, gemm_k / 128, 1));
+                cute::make_shape(gemm_n,
+                    gemm_k
+                        / (layout_info.int4_groupwise_params.use_wfp4a16
+                                ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size
+                                : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size),
+                    1));
     }
 }
 
@@ -1185,8 +1188,13 @@ __device__ void computeTmaWarpSpecializedInputPointers(TmaWarpSpecializedGrouped
     }
     if (layout_info.int4_groupwise_params.enabled)
     {
-        layout_info.int4_groupwise_params.ptr_s_a[out_idx]
-            = safe_inc_ptr(w4a8_weight_scale, expert * (gemm_n * gemm_k / 128));
+        // The group size of wfp4a16 is multiplied by 2 because each scale uses 1 byte instead of 2 bytes
+        layout_info.int4_groupwise_params.ptr_s_a[out_idx] = safe_inc_ptr(w4a8_weight_scale,
+            expert
+                * (gemm_n * gemm_k
+                    / (layout_info.int4_groupwise_params.use_wfp4a16
+                            ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size * 2
+                            : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size)));
     }
 }
 
@@ -1487,7 +1495,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
         : TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
     constexpr int64_t ELEM_PER_THREAD
-        = (is_nvfp4 || is_mxfp8) ? CVT_FP4_ELTS_PER_THREAD : (128 / sizeof_bits::value);
+        = (is_nvfp4 || is_mxfp8) ? CVT_ELTS_PER_THREAD : (128 / sizeof_bits::value);
     // This should be VecSize * 4 elements
     // We assume at least VecSize alignment or the quantization will fail
@@ -2007,16 +2015,67 @@ INSTANTIATE_FINALIZE_MOE_ROUTING(float, float, float);
 INSTANTIATE_FINALIZE_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, __nv_bfloat16);
 #endif
 
+// ============================== Activation Adaptors =================================
+template